Unverified commit 558068cc authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Remove memory header (Part2) (#50870)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs
Parent d9fb639c
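The thrust of this commit: phi-side code stops including fluid's memcpy/memory headers and calls phi::memory_utils::Copy instead, which forwards to whatever implementation fluid registers at initialization. A minimal sketch of the call-site migration; the wrapper name and its parameters are illustrative, not from the commit:

#include "paddle/phi/common/memory_utils.h"

// Sketch only: how a migrated phi call site looks. The same call used to be
// paddle::memory::Copy(...), pulled in via "paddle/fluid/memory/memcpy.h".
void CopySketch(const phi::Place& dst_place, void* dst,
                const phi::Place& src_place, const void* src,
                size_t num, void* stream) {
  if (stream != nullptr) {
    // asynchronous copy on the given stream (GPU/XPU/CustomDevice builds)
    phi::memory_utils::Copy(dst_place, dst, src_place, src, num, stream);
  } else {
    // copy without an explicit stream
    phi::memory_utils::Copy(dst_place, dst, src_place, src, num);
  }
}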
......@@ -257,6 +257,7 @@ void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
return Copy(place_dst, dst, src_place, src, num);
}
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -133,7 +133,7 @@ endif()
cc_library(
init
SRCS init.cc
DEPS device_context custom_kernel context_pool)
DEPS device_context custom_kernel context_pool memcpy)
# memcpy depends on device_context; add the deps individually here to
# avoid cyclic dependencies
......
......@@ -55,7 +55,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
......@@ -469,6 +469,14 @@ void InitMemoryMethod() {
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
#if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \
defined(PADDLE_WITH_HIP)
memory_method->copy_with_stream =
paddle::memory::Copy<phi::Place, phi::Place>;
#endif
memory_method->copy = paddle::memory::Copy<phi::Place, phi::Place>;
memory_method->device_memory_stat_current_value =
paddle::memory::DeviceMemoryStatCurrentValue;
memory_utils.Init(std::move(memory_method));
});
}
......
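For context, the registration above in InitMemoryMethod() is what makes the phi-side calls below work at runtime. A hedged illustration of the resulting dispatch, assuming registration has already run; the helper name is hypothetical:

#include <vector>

#include "paddle/phi/common/memory_utils.h"

// Hypothetical snippet: after the function pointers above are bound, even a
// plain host-to-host copy flows through the phi interface.
void HostCopySketch() {
  std::vector<char> src(64, 1);
  std::vector<char> dst(64, 0);
  // Dispatch chain: MemoryUtils::Instance().Copy -> memory_method_->copy
  //   -> paddle::memory::Copy<phi::Place, phi::Place> (bound above).
  phi::memory_utils::Copy(phi::CPUPlace(), dst.data(),
                          phi::CPUPlace(), src.data(), src.size());
}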
......@@ -47,6 +47,27 @@ void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
MemoryUtils::Instance().Copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
dev_id);
}
} // namespace memory_utils
} // namespace phi
......@@ -77,6 +77,42 @@ struct MemoryInterface {
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
/**
* @brief Copy memory from one place to another place.
*
* @param[Place] dst_place Destination allocation place (CPU, GPU, XPU, or
* CustomDevice).
* @param[void*] dst Destination memory address.
* @param[Place] src_place Source allocation place (CPU, GPU, XPU, or
* CustomDevice).
* @param[void*] src Source memory address.
* @param[size_t] num Memory size in bytes to copy.
* @param[void*] stream Stream used for asynchronous memory copy.
*
* @note For GPU/XPU/CustomDevice memory copy, a stream needs to be specified
* for the copy to run asynchronously, and its concrete type is restored in
* the implementation.
*
*/
void (*copy)(
Place dst_place, void* dst, Place src_place, const void* src, size_t num);
void (*copy_with_stream)(Place dst_place,
void* dst,
Place src_place,
const void* src,
size_t num,
void* stream);
/**
* @brief Get the current device memory STAT value.
*
* @param[std::string] stat_type Memory stat type, either 'Allocated' or
* 'Reserved'.
* @param[int] dev_id Device id.
*/
int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
int dev_id);
};
class MemoryUtils {
......@@ -156,6 +192,48 @@ class MemoryUtils {
return memory_method_->allocation_deleter(allocation);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->copy_with_stream,
nullptr,
phi::errors::Unavailable(
"copy_with_stream method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy_with_stream(
dst_place, dst, src_place, src, num, stream);
}
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->copy,
nullptr,
phi::errors::Unavailable("copy method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->copy(dst_place, dst, src_place, src, num);
}
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type,
int dev_id) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->device_memory_stat_current_value,
nullptr,
phi::errors::Unavailable(
"device_memory_stat_current_value method in memory_method_ is not "
"initiazed yet. You need init it first."));
return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
......@@ -199,6 +277,18 @@ bool InSameStream(const std::shared_ptr<Allocation>& allocation,
void AllocationDeleter(Allocation* allocation);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num,
void* stream);
void Copy(const Place& dst_place,
void* dst,
const Place& src_place,
const void* src,
size_t num);
int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
} // namespace memory_utils
} // namespace phi
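A sketch of how a phi GPU kernel might use the stream overload of Copy and the stat query declared above, under a CUDA/HIP build; the helper name and its arguments are illustrative, and the stat string follows the 'Allocated'/'Reserved' convention documented in MemoryInterface:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

// Hypothetical helper: copy num_bytes from device to host on the context's
// stream, then report the device's current "Allocated" memory stat.
int64_t CopyBackAndReport(const phi::GPUContext& dev_ctx,
                          const void* device_src,
                          void* host_dst,
                          size_t num_bytes) {
  phi::memory_utils::Copy(phi::CPUPlace(), host_dst,
                          dev_ctx.GetPlace(), device_src,
                          num_bytes, dev_ctx.stream());
  dev_ctx.Wait();  // the stream overload copies asynchronously
  return phi::memory_utils::DeviceMemoryStatCurrentValue(
      "Allocated", dev_ctx.GetPlace().GetDeviceId());
}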
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
......@@ -41,12 +40,12 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
auto stream = dev_ctx->stream();
void *src = (*gpu_)->ptr();
void *dst = cpu_->data();
paddle::memory::Copy(phi::CPUPlace(),
dst,
OptionalCUDAPlace(*gpu_).get(),
src,
*gpu_memory_size_,
stream);
memory_utils::Copy(phi::CPUPlace(),
dst,
OptionalCUDAPlace(*gpu_).get(),
src,
*gpu_memory_size_,
stream);
dev_ctx->Wait();
#endif
}
......@@ -64,12 +63,12 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
dst,
phi::CPUPlace(),
src,
*gpu_memory_size_,
stream);
memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
dst,
phi::CPUPlace(),
src,
*gpu_memory_size_,
stream);
dev_ctx->Wait();
#endif
}
......
......@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/selected_rows_impl.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
struct ReAllocateVisitor {
......
......@@ -16,11 +16,10 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi {
......@@ -99,13 +98,13 @@ void Copy(const Context& dev_ctx,
if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(src_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
(dst_place.GetType() == AllocationType::CPU ||
dst_place.GetType() == AllocationType::GPUPINNED)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
auto src_gpu_place = src_place;
......@@ -128,7 +127,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if ((src_place.GetType() == AllocationType::CPU ||
src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT
......@@ -153,7 +152,7 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::GPU && // NOLINT
dst_place.GetType() == AllocationType::GPU) {
......@@ -170,16 +169,16 @@ void Copy(const Context& dev_ctx,
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
if (src_place.GetType() == dst_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
if (ctx_place.GetType() == src_place.GetType()) {
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
} else if (ctx_place.GetType() == dst_place.GetType()) {
phi::DeviceContextPool::Instance().Get(src.place())->Wait();
paddle::memory::Copy(
memory_utils::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else {
PADDLE_THROW(errors::Unavailable(
......@@ -208,16 +207,16 @@ void Copy(const Context& dev_ctx,
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
memory_utils::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
#endif
#ifdef PADDLE_WITH_XPU
} else if (src_place.GetType() == AllocationType::XPU && // NOLINT
dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::CPU &&
dst_place.GetType() == AllocationType::XPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (src_place.GetType() == AllocationType::XPU &&
dst_place.GetType() == AllocationType::XPU) {
if (src_ptr == dst_ptr) {
......@@ -225,7 +224,7 @@ void Copy(const Context& dev_ctx,
<< dst_place;
return;
}
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
......@@ -234,21 +233,21 @@ void Copy(const Context& dev_ctx,
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CPU && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
} else if (src_place.GetType() == AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == AllocationType::CUSTOM) {
auto stream =
blocking
? nullptr
: reinterpret_cast<const phi::CustomContext&>(dev_ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
#endif
} else {
PADDLE_THROW(errors::Unimplemented(
......@@ -425,22 +424,21 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -451,7 +449,7 @@ void TensorFromVector(const std::vector<T>& src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -480,28 +478,27 @@ void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
auto stream = reinterpret_cast<const phi::CustomContext&>(ctx).stream();
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -573,22 +570,21 @@ void TensorFromArray(const T* src,
auto size = array_size * sizeof(T);
if (dst_place.GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src_place,
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (dst_place.GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
memory_utils::Copy(
dst_place,
dst_ptr,
src_place,
......@@ -599,7 +595,7 @@ void TensorFromArray(const T* src,
#endif
#ifdef PADDLE_WITH_XPU
else if (dst_place.GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#endif
else { // NOLINT
......@@ -674,28 +670,26 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(dst->data());
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
else { // NOLINT
......@@ -718,28 +712,26 @@ void TensorToVector(const phi::DenseTensor& src,
auto dst_ptr = static_cast<void*>(array);
if (src.place().GetType() == AllocationType::CPU) {
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (src.place().GetType() == AllocationType::GPU) { // NOLINT
paddle::memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
memory_utils::Copy(dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (src.place().GetType() == AllocationType::XPU) { // NOLINT
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (src.place().GetType() == AllocationType::CUSTOM) { // NOLINT
paddle::memory::Copy(
dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
for (unsigned int i = 0; i < src.numel(); i++) {
......@@ -800,7 +792,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
template <>
......@@ -821,7 +813,7 @@ void TensorToVector(const phi::DenseTensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
paddle::memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
......@@ -13,10 +13,9 @@
// limitations under the License.
#include "paddle/phi/kernels/index_add_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/utils/data_type.h"
// #include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/cpu/index_add_impl.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -43,11 +43,11 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T));
}
}
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -45,11 +45,11 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T));
}
}
......
......@@ -22,8 +22,7 @@
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#endif
namespace phi {
......@@ -45,13 +44,13 @@ static int ConvertDataByType(
T1* cpu_data = reinterpret_cast<T1*>(malloc(sizeof(T1) * len));
paddle::memory::Copy(
memory_utils::Copy(
CPUPlace(), cpu_data, dev_ctx.GetPlace(), x, len * sizeof(T1));
T2* cpu_real_data = reinterpret_cast<T2*>(malloc(sizeof(T2) * len));
for (int i = 0; i < len; i++) cpu_real_data[i] = static_cast<T2>(cpu_data[i]);
paddle::memory::Copy(
memory_utils::Copy(
dev_ctx.GetPlace(), *y, CPUPlace(), cpu_real_data, len * sizeof(T2));
free(cpu_data);
......
......@@ -57,11 +57,11 @@ struct ConcatFunctor<phi::CPUContext, T> {
int64_t col_len = input_cols[j];
auto input_data = input[j].data<T>();
for (int64_t k = 0; k < out_rows; ++k) {
paddle::memory::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
memory_utils::Copy(cpu_place,
output_data + k * out_cols + col_idx,
cpu_place,
input_data + k * col_len,
sizeof(T) * col_len);
}
col_idx += col_len;
}
......@@ -114,11 +114,11 @@ struct SplitFunctor<phi::CPUContext, T> {
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len;
paddle::memory::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
memory_utils::Copy(cpu_place,
dst_ptr,
cpu_place,
src_ptr + col_idx,
sizeof(T) * col_len);
}
col_idx += col_len;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi {
......@@ -105,12 +106,12 @@ struct PointerToPointer {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
pre_alloced_host_ptr, in_num);
paddle::memory::Copy(ctx.GetPlace(),
(*dev_ins_ptr)->ptr(),
phi::CPUPlace(),
restored,
in_num * sizeof(T*),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(*dev_ins_ptr)->ptr(),
phi::CPUPlace(),
restored,
in_num * sizeof(T*),
ctx.stream());
ins_addr = reinterpret_cast<void**>((*dev_ins_ptr)->ptr());
}
};
......@@ -155,12 +156,12 @@ struct PointerToPointerAndCol {
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
auto* restored = phi::backends::gpu::RestoreHostMemIfCapturingCUDAGraph(
inputs_col, inputs_col_num);
paddle::memory::Copy(ctx.GetPlace(),
(*dev_col_ptr)->ptr(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(IndexT),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(*dev_col_ptr)->ptr(),
phi::CPUPlace(),
restored,
inputs_col_num * sizeof(IndexT),
ctx.stream());
col_length = static_cast<IndexT*>((*dev_col_ptr)->ptr());
ins_ptr_wrapper =
PointerToPointer<T>(ctx, ins, pre_alloced_host_ptr, dev_ins_ptr);
......@@ -570,11 +571,11 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function
phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
phi::Allocator::AllocationPtr data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif
......@@ -786,11 +787,11 @@ void SplitFunctorDispatchWithIndexType(
#ifdef PADDLE_WITH_HIP
phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
data_alloc =
phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif
......
......@@ -19,13 +19,11 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/utils/data_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
namespace phi {
namespace funcs {
......
......@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/device_context.h"
......@@ -39,12 +39,12 @@ struct StridedMemcpyFunctor<T, 0> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(
memory_utils::Copy(
gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream());
#else
PADDLE_THROW(
......@@ -65,18 +65,18 @@ struct StridedMemcpyFunctor<T, 1> {
auto place = dev_ctx.GetPlace();
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(
memory_utils::Copy(
cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(dev_ctx);
paddle::memory::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
memory_utils::Copy(gpu_place,
dst,
gpu_place,
src,
sizeof(T) * dst_dim[0],
cuda_ctx.stream());
#else
PADDLE_THROW(
phi::errors::Unavailable("Paddle is not compiled with GPU."));
......
......@@ -23,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/for_range.h"
#if defined(__NVCC__) || defined(__HIPCC__)
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -1544,19 +1542,19 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *out_dims_array_gpu =
reinterpret_cast<int *>(y_strides_array_gpu + max_dim);
paddle::memory::Copy(gplace,
x_strides_array_gpu,
cplace,
x_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
y_strides_array_gpu,
cplace,
y_strides_array.data(),
bytes,
ctx.stream());
paddle::memory::Copy(
memory_utils::Copy(gplace,
x_strides_array_gpu,
cplace,
x_strides_array.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_strides_array_gpu,
cplace,
y_strides_array.data(),
bytes,
ctx.stream());
memory_utils::Copy(
gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream());
const int out_size = std::accumulate(
......@@ -1573,18 +1571,18 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *x_dims_order_gpu =
reinterpret_cast<int *>(x_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
x_strides_order_gpu,
cplace,
x_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
x_dims_order_gpu,
cplace,
x_dims_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
x_strides_order_gpu,
cplace,
x_strides_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
x_dims_order_gpu,
cplace,
x_dims_order.data(),
bytes,
ctx.stream());
CommonGradBroadcastCUDAKernel<T, DX_OP, Tout>
<<<x_blocks, x_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
y_strides_array_gpu,
......@@ -1612,18 +1610,18 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int *y_dims_order_gpu =
reinterpret_cast<int *>(y_strides_order_gpu + max_dim);
paddle::memory::Copy(gplace,
y_strides_order_gpu,
cplace,
y_strides_order.data(),
bytes,
ctx.stream());
paddle::memory::Copy(gplace,
y_dims_order_gpu,
cplace,
y_dims_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_strides_order_gpu,
cplace,
y_strides_order.data(),
bytes,
ctx.stream());
memory_utils::Copy(gplace,
y_dims_order_gpu,
cplace,
y_dims_order.data(),
bytes,
ctx.stream());
CommonGradBroadcastCUDAKernel<T, DY_OP, Tout>
<<<y_blocks, y_block_size, 0, ctx.stream()>>>(x_strides_array_gpu,
y_strides_array_gpu,
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
// TODO(paddle-dev): move gpu_primitives.h to phi
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -200,7 +199,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......@@ -243,7 +242,7 @@ struct TransposeNormal<phi::GPUContext, T> {
cpu_buf[rank + i] = out_stride[i];
cpu_buf[2 * rank + i] = axis[i];
}
paddle::memory::Copy(
memory_utils::Copy(
cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream());
REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
......
......@@ -119,11 +119,11 @@ struct TensorSetConstantXPU {
int numel = tensor_->numel();
std::unique_ptr<T[]> data_cpu(new T[numel]);
std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
paddle::memory::Copy(place_,
begin,
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
numel * sizeof(T));
memory_utils::Copy(place_,
begin,
phi::CPUPlace(),
static_cast<void*>(data_cpu.get()),
numel * sizeof(T));
}
phi::DenseTensor* tensor_;
U value_;
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace phi {
......@@ -39,12 +39,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_mat_data->ptr(),
dev_ctx.GetPlace(),
a.data(),
a.numel() * sizeof(T),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_mat_data->ptr(),
dev_ctx.GetPlace(),
a.data(),
a.numel() * sizeof(T),
dev_ctx.stream());
gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
}
......@@ -62,12 +62,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
T** gpu_inv_pivot_info = reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr());
T** gpu_inv_ptrs = gpu_inv_pivot_info + batch_size;
int* gpu_info_ptr =
......@@ -107,12 +107,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
gpu_info_ptr,
batch_size);
}
paddle::memory::Copy(phi::CPUPlace(),
info.data(),
dev_ctx.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
info.data(),
dev_ctx.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
dev_ctx.stream());
for (int i = 0; i < batch_size; ++i) {
PADDLE_ENFORCE_EQ(info[i],
0,
......
......@@ -84,12 +84,12 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
context.stream());
memory_utils::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
context.stream());
T** gpu_tmp_b_ptrs =
reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
......@@ -121,12 +121,12 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
batch_size);
// check whether BatchedGETRF is executed successfully or not
paddle::memory::Copy(phi::CPUPlace(),
info.data(),
context.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
context.stream());
memory_utils::Copy(phi::CPUPlace(),
info.data(),
context.GetPlace(),
gpu_info_ptr,
sizeof(int) * batch_size,
context.stream());
for (int i = 0; i < batch_size; ++i) {
PADDLE_ENFORCE_EQ(info[i],
0,
......
......@@ -25,9 +25,8 @@ namespace cub = hipcub;
#endif
#include <algorithm>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/primitive/kernel_primitives.h"
......@@ -433,12 +432,12 @@ void SelectKernel(const KPDevice &dev_ctx,
// 3.1 set temp ptr for in;
// 3.1 alloc for out
// 3.1.1 get true_num for gpu place; the last cumsum is the true_num
paddle::memory::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
t_size,
dev_ctx.stream());
memory_utils::Copy(cpu_place,
&total_true_num,
cuda_place,
cumsum_data + need_grids,
t_size,
dev_ctx.stream());
dev_ctx.Wait();
// 3.1.2 alloc for out with total_true_num
......
......@@ -93,18 +93,18 @@ struct SelectedRowsAdd<phi::CPUContext, T> {
auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>();
paddle::memory::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T));
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T));
}
};
......@@ -219,11 +219,11 @@ struct SelectedRowsAddTo<phi::CPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T));
}
};
......@@ -566,11 +566,11 @@ struct MergeAddImpl {
for (auto* in : inputs) {
auto* in_data = in->value().data<T>();
auto in_numel = in->rows().size() * input_width;
paddle::memory::Copy(out_place,
out_data + copied_numel,
in_place,
in_data,
in_numel * sizeof(T));
memory_utils::Copy(out_place,
out_data + copied_numel,
in_place,
in_data,
in_numel * sizeof(T));
copied_numel += in_numel;
}
} else {
......@@ -680,16 +680,16 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
int r = xpu::merge_dup_rows<T, int64_t>(context.x_context(),
x_data,
y_data,
......@@ -778,16 +778,16 @@ struct MergeAdd<phi::XPUContext, T> {
xpu::ctx_guard RAII_GUARD(context.x_context());
int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(xm);
int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(ym);
paddle::memory::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
paddle::memory::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
y_rows_data,
phi::CPUPlace(),
merge_rows.data(),
ym * sizeof(int64_t));
memory_utils::Copy(context.GetPlace(),
x_rows_data,
phi::CPUPlace(),
input_rows.data(),
xm * sizeof(int64_t));
int r = xpu::merge_dup_rows<T, int64_t>(context.x_context(),
x_data,
y_data,
......
......@@ -91,20 +91,20 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
paddle::memory::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(out_place,
out_data,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
auto* in2_data = in2_value.data<T>();
paddle::memory::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(out_place,
out_data + in1_value.numel(),
in2_place,
in2_data,
in2_value.numel() * sizeof(T),
context.stream());
}
};
......@@ -249,12 +249,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
paddle::memory::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
memory_utils::Copy(in2_place,
in2_data + input2_offset,
in1_place,
in1_data,
in1_value.numel() * sizeof(T),
context.stream());
}
};
......
......@@ -104,39 +104,39 @@ inline void StridedNumelCopyWithAxis(const phi::DeviceContext& ctx,
for (int64_t i = 0; i < before; ++i) {
if (place.GetType() == phi::AllocationType::CPU) {
auto& cpu_place = place;
paddle::memory::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
memory_utils::Copy(cpu_place,
dst + i * dst_after,
cpu_place,
src + i * src_after,
sizeof(T) * size);
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto& gpu_place = place;
auto& cuda_ctx = reinterpret_cast<const phi::GPUContext&>(ctx);
paddle::memory::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
memory_utils::Copy(gpu_place,
dst + i * dst_after,
gpu_place,
src + i * src_after,
sizeof(T) * size,
cuda_ctx.stream());
#elif defined(PADDLE_WITH_ASCEND_CL)
auto& npu_place = place;
auto& npu_ctx = reinterpret_cast<const platform::NPUDeviceContext&>(ctx);
paddle::memory::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
memory_utils::Copy(npu_place,
dst + i * dst_after,
npu_place,
src + i * src_after,
sizeof(T) * size,
npu_ctx.stream());
#elif defined(PADDLE_WITH_MLU)
auto& mlu_place = place;
auto& mlu_ctx = reinterpret_cast<const platform::MLUDeviceContext&>(ctx);
paddle::memory::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
memory_utils::Copy(mlu_place,
dst + i * dst_after,
mlu_place,
src + i * src_after,
sizeof(T) * size,
mlu_ctx.stream());
#else
PADDLE_THROW(
phi::errors::PreconditionNotMet("Paddle is not compiled with GPU."));
......
......@@ -16,9 +16,9 @@
#include <sstream>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/utils/string/string_helper.h"
......@@ -39,12 +39,12 @@ static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
std::vector<CopyT> cpu_x(n);
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
paddle::memory::Copy(phi::CPUPlace(),
cpu_x.data(),
place,
x,
n * sizeof(T),
dev_ctx->stream());
memory_utils::Copy(phi::CPUPlace(),
cpu_x.data(),
place,
x,
n * sizeof(T),
dev_ctx->stream());
dev_ctx->Wait();
return std::vector<T>(cpu_x.data(), cpu_x.data() + n);
}
......
......@@ -13,7 +13,6 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/errors.h"
......@@ -191,12 +190,12 @@ static void CheckEighResult(const GPUContext &dev_ctx,
const int64_t batch_size,
int *info) {
std::vector<int> error_info(batch_size);
paddle::memory::Copy(phi::CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info,
sizeof(int) * batch_size,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info,
sizeof(int) * batch_size,
dev_ctx.stream());
dev_ctx.Wait();
for (auto i = 0; i < batch_size; ++i) {
CheckEighResult(i, error_info[i]);
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
......@@ -208,12 +207,12 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
......@@ -229,12 +228,12 @@ void AddNKernel(const Context &dev_ctx,
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(in_data.data()),
in_data.size() * sizeof(T *),
dev_ctx.stream());
T **in_array_data = reinterpret_cast<T **>(tmp_in_array->ptr());
ComputeKernelParameter(lod_length);
......
......@@ -20,8 +20,6 @@
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
// Utils
......@@ -176,12 +174,12 @@ class LazyZeros<phi::GPUContext, T> {
for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
// copy the data address array of the tensors in "outs" to device
auto h_out_addrs_mem =
......@@ -197,12 +195,12 @@ class LazyZeros<phi::GPUContext, T> {
for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = dev_ctx.Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_out_addrs,
cpu_place,
h_out_addrs,
xs_size * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_out_addrs,
cpu_place,
h_out_addrs,
xs_size * sizeof(T*),
dev_ctx.stream());
// launch cuda kernel
int64_t total_num = h_starts[xs_size];
......@@ -306,12 +304,12 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
}
int64_t total_num = h_starts[xs_size];
paddle::memory::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_starts,
cpu_place,
h_starts,
(xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
......@@ -329,12 +327,12 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
h_xs[i] = xs[i]->data<T>();
h_outs[i] = dev_ctx.template Alloc<T>(outs[i]);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
d_xs,
cpu_place,
h_xs,
2 * xs_size * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
d_xs,
cpu_place,
h_xs,
2 * xs_size * sizeof(T*),
dev_ctx.stream());
// Launch Kernel
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
......
......@@ -30,24 +30,24 @@ void GetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
int64_t* old_num_accumulates) {
auto stream = dev_ctx.stream();
auto cuda_place = in_old_num_accumulates.place();
paddle::memory::Copy(phi::CPUPlace(),
old_num_accumulates,
cuda_place,
in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
num_accumulates,
cuda_place,
in_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
paddle::memory::Copy(phi::CPUPlace(),
num_updates,
cuda_place,
in_num_updates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
old_num_accumulates,
cuda_place,
in_old_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
num_accumulates,
cuda_place,
in_num_accumulates.data<int64_t>(),
sizeof(int64_t),
stream);
memory_utils::Copy(phi::CPUPlace(),
num_updates,
cuda_place,
in_num_updates.data<int64_t>(),
sizeof(int64_t),
stream);
}
template <>
......@@ -68,26 +68,26 @@ void SetAccumulators<phi::GPUContext>(const phi::GPUContext& dev_ctx,
auto stream = dev_ctx.stream();
auto cuda_place = out_old_num_accumulates->place();
paddle::memory::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr,
phi::CPUPlace(),
&num_accumulates,
sizeof(int64_t),
stream);
memory_utils::Copy(dev_ctx.GetPlace(),
out_num_accumulates_ptr,
phi::CPUPlace(),
&num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr,
phi::CPUPlace(),
&old_num_accumulates,
sizeof(int64_t),
stream);
memory_utils::Copy(dev_ctx.GetPlace(),
out_old_num_accumulates_ptr,
phi::CPUPlace(),
&old_num_accumulates,
sizeof(int64_t),
stream);
paddle::memory::Copy(cuda_place,
out_num_updates_ptr,
phi::CPUPlace(),
&num_updates,
sizeof(int64_t),
stream);
memory_utils::Copy(cuda_place,
out_num_updates_ptr,
phi::CPUPlace(),
&num_updates,
sizeof(int64_t),
stream);
}
} // namespace phi
......
......@@ -17,7 +17,6 @@
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -207,7 +206,7 @@ void BoxCoderKernel(const Context &dev_ctx,
float *dev_var_data = reinterpret_cast<float *>(dev_var->ptr());
auto cplace = phi::CPUPlace();
const auto gplace = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, dev_var_data, cplace, &variance[0], bytes, dev_ctx.stream());
output_box->Resize({row, col, len});
......
......@@ -22,7 +22,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -196,12 +195,12 @@ void CholeskyKernel(const Context& dev_ctx,
std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count);
paddle::memory::Copy(CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info_ptr,
sizeof(int) * batch_count,
dev_ctx.stream());
memory_utils::Copy(CPUPlace(),
error_info.data(),
dev_ctx.GetPlace(),
info_ptr,
sizeof(int) * batch_count,
dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
PADDLE_ENFORCE_EQ(error_info[i],
......
......@@ -29,7 +29,7 @@ namespace cub = hipcub;
#include <iterator>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -581,12 +581,12 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
T* sampled_local_class_center_ptr =
dev_ctx.template Alloc<T>(sampled_local_class_center);
paddle::memory::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr,
dev_ctx.GetPlace(),
cub_sort_values_out_ptr,
actual_num_samples * sizeof(T),
nullptr);
memory_utils::Copy(dev_ctx.GetPlace(),
sampled_local_class_center_ptr,
dev_ctx.GetPlace(),
cub_sort_values_out_ptr,
actual_num_samples * sizeof(T),
nullptr);
}
} // namespace phi
......
......@@ -24,7 +24,6 @@ namespace cub = hipcub;
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
......@@ -32,7 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -220,12 +219,12 @@ void DistributeFpnProposalsKernel(
int start = 0;
std::vector<int> sub_lod_list_cpu(lod_size * num_level);
paddle::memory::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(),
place,
sub_lod_list_data,
sizeof(int) * lod_size * num_level,
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
sub_lod_list_cpu.data(),
place,
sub_lod_list_data,
sizeof(int) * lod_size * num_level,
dev_ctx.stream());
dev_ctx.Wait();
for (int i = 0; i < num_level; ++i) {
......
......@@ -17,9 +17,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -136,12 +136,12 @@ void EditDistanceKernel(const Context& ctx,
if (normalized) {
distance = distance / n;
}
paddle::memory::Copy(ctx.GetPlace(),
out_data + num,
CPUPlace(),
&distance,
sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
out_data + num,
CPUPlace(),
&distance,
sizeof(T),
stream);
} else {
DenseTensor dist_t;
dist_t.Resize({m + 1, n + 1});
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -182,12 +182,12 @@ struct EmbeddingSparseGradCUDAFunctor {
InputTypeConvert<<<grids, threads, 0, stream>>>(
ids_data, ids_num, mixv_new_rows.MutableData(gpu_place));
} else {
paddle::memory::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
ids_num * sizeof(int64_t),
stream);
memory_utils::Copy(gpu_place,
mixv_new_rows.CUDAMutableData(gpu_place),
gpu_place,
ids_data,
ids_num * sizeof(int64_t),
stream);
}
mixv_new_rows.CopyToCPU();
......@@ -211,12 +211,12 @@ struct EmbeddingSparseGradCUDAFunctor {
"output@Grad's shape = [%s].",
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
d_output->numel() * sizeof(T),
stream);
memory_utils::Copy(gpu_place,
d_table_data,
gpu_place,
d_output_data,
d_output->numel() * sizeof(T),
stream);
}
private:
......
......@@ -17,8 +17,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -80,12 +80,12 @@ void FillDiagonalTensorGradKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + matrows}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + matrows),
stream);
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + matrows),
stream);
int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2];
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -96,12 +96,12 @@ void FillDiagonalTensorKernel(const Context &ctx,
tensor_tmp.Resize(phi::make_ddim({2 + fill_dims[0]}));
int64_t *memory_block_cu = ctx.template Alloc<int64_t>(&tensor_tmp);
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + fill_dims[0]),
stream);
memory_utils::Copy(gpu_place,
memory_block_cu,
CPUPlace(),
memory_block.data(),
sizeof(int64_t) * (2 + fill_dims[0]),
stream);
int64_t *strides_cu = &memory_block_cu[0], *matdim_cu = &memory_block_cu[2];
......
......@@ -311,12 +311,12 @@ static void NMS(const phi::GPUContext &ctx,
memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
std::vector<uint64_t> mask_host(boxes_num * col_blocks);
paddle::memory::Copy(CPUPlace(),
mask_host.data(),
place,
mask_dev,
boxes_num * col_blocks * sizeof(uint64_t),
ctx.stream());
memory_utils::Copy(CPUPlace(),
mask_host.data(),
place,
mask_dev,
boxes_num * col_blocks * sizeof(uint64_t),
ctx.stream());
std::vector<int> keep_vec;
int num_to_keep = 0;
......@@ -335,12 +335,12 @@ static void NMS(const phi::GPUContext &ctx,
}
keep_out->Resize(phi::make_ddim({num_to_keep}));
int *keep = ctx.template Alloc<int>(keep_out);
paddle::memory::Copy(place,
keep,
CPUPlace(),
keep_vec.data(),
sizeof(int) * num_to_keep,
ctx.stream());
memory_utils::Copy(place,
keep,
CPUPlace(),
keep_vec.data(),
sizeof(int) * num_to_keep,
ctx.stream());
ctx.Wait();
}
......@@ -401,12 +401,12 @@ static std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
pixel_offset);
int keep_num;
const auto gpu_place = ctx.GetPlace();
paddle::memory::Copy(CPUPlace(),
&keep_num,
gpu_place,
keep_num_t.data<int>(),
sizeof(int),
ctx.stream());
memory_utils::Copy(CPUPlace(),
&keep_num,
gpu_place,
keep_num_t.data<int>(),
sizeof(int),
ctx.stream());
ctx.Wait();
keep_index.Resize(phi::make_ddim({keep_num}));
......@@ -542,18 +542,18 @@ void GenerateProposalsKernel(const Context &ctx,
DenseTensor &proposals = box_score_pair.first;
DenseTensor &nscores = box_score_pair.second;
paddle::memory::Copy(place,
rpn_rois_data + num_proposals * 4,
place,
proposals.data<T>(),
sizeof(T) * proposals.numel(),
ctx.stream());
paddle::memory::Copy(place,
rpn_roi_probs_data + num_proposals,
place,
nscores.data<T>(),
sizeof(T) * nscores.numel(),
ctx.stream());
memory_utils::Copy(place,
rpn_rois_data + num_proposals * 4,
place,
proposals.data<T>(),
sizeof(T) * proposals.numel(),
ctx.stream());
memory_utils::Copy(place,
rpn_roi_probs_data + num_proposals,
place,
nscores.data<T>(),
sizeof(T) * nscores.numel(),
ctx.stream());
ctx.Wait();
num_proposals += proposals.dims()[0];
offset.emplace_back(num_proposals);
......@@ -563,12 +563,12 @@ void GenerateProposalsKernel(const Context &ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
ctx.template Alloc<int>(rpn_rois_num);
int *num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(place,
num_data,
cpu_place,
&tmp_num[0],
sizeof(int) * num,
ctx.stream());
memory_utils::Copy(place,
num_data,
cpu_place,
&tmp_num[0],
sizeof(int) * num,
ctx.stream());
rpn_rois_num->Resize(phi::make_ddim({num}));
}
phi::LoD lod;
......
......@@ -28,7 +28,6 @@
namespace cub = hipcub;
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......
......@@ -20,7 +20,6 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -119,12 +118,12 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
info,
gesvdj_params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -199,12 +198,12 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -255,12 +254,12 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -310,12 +309,12 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
info,
params));
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/mean_all_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
#include "paddle/phi/kernels/primitive/functor_primitives.h"
......@@ -33,7 +33,7 @@ void MeanAllKernel(const Context& dev_ctx,
auto stream = dev_ctx.stream();
if (rank == 0) { // scalar
paddle::memory::Copy(
memory_utils::Copy(
place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -47,12 +47,12 @@ void MultiplexGradKernel(const Context& ctx,
for (auto i = 0; i < rows; i++) {
size_t k = static_cast<size_t>(index[i]);
if (ins_grad[k]) {
paddle::memory::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
ins_grad[k]->data<T>() + i * cols,
ctx.GetPlace(),
out_grad.data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/multiplex_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -50,12 +50,12 @@ void MultiplexKernel(const Context& ctx,
ins.size(),
errors::PreconditionNotMet(
"index exceeds the number of candidate tensors."));
paddle::memory::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
memory_utils::Copy(ctx.GetPlace(),
out->data<T>() + i * cols,
ctx.GetPlace(),
ins[k]->data<T>() + i * cols,
cols * sizeof(T),
stream);
}
}
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nanmedian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -180,12 +179,12 @@ void ProcessMedianKernel(const Context& dev_ctx,
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr,
dev_ctx.GetPlace(),
nan_stat_mem,
sizeof(int64_t) * 2,
stream);
memory_utils::Copy(phi::CPUPlace(),
nan_stat_cpu_ptr,
dev_ctx.GetPlace(),
nan_stat_mem,
sizeof(int64_t) * 2,
stream);
// all elements are nan values
T nan_val = std::numeric_limits<T>::quiet_NaN();
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -83,12 +82,12 @@ void NMSKernel(const Context& dev_ctx,
NMS<T><<<grid, block, 0, dev_ctx.stream()>>>(
boxes.data<T>(), threshold, num_boxes, mask_dev);
std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
paddle::memory::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
mask_host.data(),
dev_ctx.GetPlace(),
mask_dev,
num_boxes * blocks_per_line * sizeof(uint64_t),
dev_ctx.stream());
std::vector<int64_t> remv(blocks_per_line);
std::vector<int64_t> keep_boxes_idxs(num_boxes);
int64_t* output_host = keep_boxes_idxs.data();
......@@ -106,12 +105,12 @@ void NMSKernel(const Context& dev_ctx,
}
output->Resize(phi::make_ddim({last_box_num}));
auto* output_data = dev_ctx.template Alloc<int64_t>(output);
paddle::memory::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * last_box_num,
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
output_data,
phi::CPUPlace(),
output_host,
sizeof(int64_t) * last_box_num,
dev_ctx.stream());
}
} // namespace phi
PD_REGISTER_KERNEL(nms, GPU, ALL_LAYOUT, phi::NMSKernel, float, double) {}
......@@ -15,8 +15,8 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -128,12 +128,12 @@ void PsroiPoolGradKernel(const Context& ctx,
if (rois_num.get_ptr()) {
rois_batch_size = rois_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num->data<int>(),
sizeof(int) * rois_batch_size,
0);
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num->data<int>(),
sizeof(int) * rois_batch_size,
0);
int start = 0;
for (int n = 0; n < rois_batch_size; ++n) {
for (int i = start; i < start + rois_num_list[n]; ++i) {
......
......@@ -17,7 +17,7 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
......@@ -150,12 +150,12 @@ void PsroiPoolKernel(const Context& ctx,
rois_batch_size,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num_data,
sizeof(int) * rois_batch_size,
0);
memory_utils::Copy(CPUPlace(),
rois_num_list.data(),
ctx.GetPlace(),
rois_num_data,
sizeof(int) * rois_batch_size,
0);
int rois_num_count = 0;
for (int i = 0; i < rois_batch_size; ++i) {
rois_num_count += rois_num_list[i];
......
......@@ -18,9 +18,9 @@
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/infermeta/unary.h"
......@@ -139,12 +139,12 @@ void QrKernel(const Context& ctx,
auto new_qr_data = ctx.template Alloc<phi::dtype::Real<T>>(&new_qr);
auto new_qr_stride = m * m;
for (int i = 0; i < batch_size; ++i) {
paddle::memory::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride),
ctx.GetPlace(),
(qr_data + i * qr_stride),
qr_stride * sizeof(phi::dtype::Real<T>),
ctx.stream());
memory_utils::Copy(ctx.GetPlace(),
(new_qr_data + i * new_qr_stride),
ctx.GetPlace(),
(qr_data + i * qr_stride),
qr_stride * sizeof(phi::dtype::Real<T>),
ctx.stream());
}
BatchedOrgqr<Context, T>(ctx,
batch_size,
......@@ -218,12 +218,12 @@ void BatchedGeqrf<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -272,12 +272,12 @@ void BatchedGeqrf<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -328,12 +328,12 @@ void BatchedOrgqr<GPUContext, float>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -384,12 +384,12 @@ void BatchedOrgqr<GPUContext, double>(const GPUContext& dev_ctx,
// Do we need synchronized here?
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......
......@@ -21,7 +21,7 @@
#include "paddle/phi/kernels/funcs/distribution_helper.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -287,12 +287,12 @@ void WeightToTensor(const Place &place,
const T *in_data = weight_list[i]->data<T>();
auto in_size = weight_list[i]->numel();
paddle::memory::Copy(weight->place(),
weight_data + weight_offset,
weight_list[i]->place(),
in_data,
in_size * sizeof(T),
stream);
memory_utils::Copy(weight->place(),
weight_data + weight_offset,
weight_list[i]->place(),
in_data,
in_size * sizeof(T),
stream);
weight_offset += in_size;
}
}
......@@ -310,12 +310,12 @@ void WeightListToTensor(const Place &place,
for (size_t i = 0; i < tensor_list.size(); ++i) {
const T *in_data = tensor_list[i].data<T>();
auto in_size = tensor_list[i].numel();
paddle::memory::Copy(weight_whole->place(),
weight_data + weight_offset,
tensor_list[i].place(),
in_data,
in_size * sizeof(T),
stream);
memory_utils::Copy(weight_whole->place(),
weight_data + weight_offset,
tensor_list[i].place(),
in_data,
in_size * sizeof(T),
stream);
weight_offset += in_size;
}
}
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -195,12 +194,12 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (size_t i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -223,7 +222,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
int bytes = box_batch_id_list.numel() * sizeof(int);
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, box_batch_size, bytes, dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -180,12 +179,12 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(cplace,
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -233,7 +232,7 @@ void RoiAlignKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(
memory_utils::Copy(
gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream());
GPURoiAlignForward<T>
<<<blocks, threads, 0, dev_ctx.stream()>>>(output_size,
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......@@ -98,12 +97,12 @@ void RoiPoolGradKernel(const Context& dev_ctx,
if (boxes_num) {
int boxes_batch_size = boxes_num->numel();
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -126,12 +125,12 @@ void RoiPoolGradKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* roi_id_data = reinterpret_cast<int*>(roi_ptr->ptr());
paddle::memory::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
memory_utils::Copy(gplace,
roi_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
dev_ctx.template Alloc<T>(dx);
phi::funcs::SetConstant<Context, T> set_zero;
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/roi_pool_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
......@@ -142,12 +141,12 @@ void RoiPoolKernel(const Context& dev_ctx,
boxes_batch_size,
batch_size));
std::vector<int> boxes_num_list(boxes_batch_size);
paddle::memory::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
memory_utils::Copy(phi::CPUPlace(),
boxes_num_list.data(),
gplace,
boxes_num->data<int>(),
sizeof(int) * boxes_batch_size,
0);
int start = 0;
for (int n = 0; n < boxes_batch_size; ++n) {
for (int i = start; i < start + boxes_num_list[n]; ++i) {
......@@ -190,12 +189,12 @@ void RoiPoolKernel(const Context& dev_ctx,
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
int* box_id_data = reinterpret_cast<int*>(box_ptr->ptr());
paddle::memory::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
memory_utils::Copy(gplace,
box_id_data,
phi::CPUPlace(),
box_batch_id_data,
bytes,
dev_ctx.stream());
T* output_data = dev_ctx.template Alloc<T>(out);
int64_t* arg_max_data = dev_ctx.template Alloc<int64_t>(arg_max);
......
......@@ -90,12 +90,12 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
dev_ctx.Wait();
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
......
......@@ -89,12 +89,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
norm_cpu_ptr,
dev_ctx.GetPlace(),
norm,
sizeof(T),
dev_ctx.stream());
dev_ctx.Wait();
auto eps = static_cast<T>(1e-5);
*norm_cpu_ptr = *norm_cpu_ptr > eps ? *norm_cpu_ptr : eps;
......
......@@ -14,7 +14,7 @@
#include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
......
......@@ -17,7 +17,6 @@
#include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -105,12 +104,12 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......@@ -186,12 +185,12 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
......
......@@ -76,7 +76,7 @@ void SyncBatchNormKernel(const Context &ctx,
const int block = 512;
int max_threads = ctx.GetMaxPhysicalThreadCount();
paddle::memory::AllocationPtr alloc_ptr{nullptr};
phi::Allocator::AllocationPtr alloc_ptr{nullptr};
if (test_mode) {
mean_data = mean.template data<BatchNormParamType<T>>();
......
......@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context>
......@@ -98,12 +95,12 @@ void TriangularSolveKernel(const Context& dev_ctx,
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
const T** gpu_a_ptrs =
reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr());
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/yolo_box_util.h"
......@@ -133,7 +133,7 @@ void YoloBoxKernel(const Context& dev_ctx,
int* anchors_data = dev_ctx.template Alloc<int>(&tmp_anchors);
const auto gplace = dev_ctx.GetPlace();
const auto cplace = phi::CPUPlace();
paddle::memory::Copy(
memory_utils::Copy(
gplace, anchors_data, cplace, anchors.data(), bytes, dev_ctx.stream());
const T* input_data = input->data<T>();
......
......@@ -20,8 +20,8 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
......@@ -49,9 +49,9 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
if (!use_fixed_workspace) {
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int64_t allocated =
paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
int64_t reserved =
paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
<< " MB, reserved=" << ToMegaBytes(reserved)
......
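The workspace-limit change just above also routes the device memory statistics query through memory_utils. A minimal sketch of that query, assuming a valid device_id and the same "Allocated"/"Reserved" stat-type strings used in the diff (the helper name is hypothetical):

    #include <cstdint>
    #include <utility>

    #include "paddle/phi/common/memory_utils.h"

    // Illustrative helper (not part of this PR): current allocated/reserved
    // device memory in bytes for the given device id.
    std::pair<int64_t, int64_t> QueryDeviceMemStats(int device_id) {
      int64_t allocated =
          phi::memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
      int64_t reserved =
          phi::memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
      return {allocated, reserved};
    }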
......@@ -23,7 +23,7 @@
#include "paddle/phi/core/dense_tensor.h"
// TODO(xiongkun): remove the header when decouple the memcpy function in phi.
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
using Tensor = DenseTensor;
......@@ -58,7 +58,7 @@ struct GetTensorValue<phi::GPUContext, T> {
const T* data = tensor.data<T>();
T value;
const auto gpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(
memory_utils::Copy(
phi::CPUPlace(), &value, gpu_place, data, sizeof(T), dev_ctx.stream());
return value;
}
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......@@ -153,12 +153,12 @@ inline void BatchedOrmqr<GPUContext, float>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......@@ -222,12 +222,12 @@ inline void BatchedOrmqr<GPUContext, double>(const GPUContext& dev_ctx,
// check the error info
int info_h;
paddle::memory::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
memory_utils::Copy(phi::CPUPlace(),
&info_h,
dev_ctx.GetPlace(),
info_d,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
info_h,
0,
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/optional.h"
......
......@@ -62,7 +62,7 @@ void MemcpyD2HKernel(const Context& dev_ctx,
case 1:
Copy(dev_ctx, x, GPUPinnedPlace(), false, out);
// paddle::memory::Copy use async copy for GPUPinnedPlace
// Copy use async copy for GPUPinnedPlace
dev_ctx.Wait();
break;
......
......@@ -71,22 +71,22 @@ void AdamDenseParamSparseGradKernel(
if (beta1_pow.dtype() == DataType::FLOAT16) {
XPUType* beta1_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
beta1_pow_t,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
memory_utils::Copy(param.place(),
beta1_pow_t,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
int r = xpu::cast<XPUType, float>(
dev_ctx.x_context(), beta1_pow_t, beta1_pow_ptr, beta1_pow.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta1_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta1_pow.numel());
paddle::memory::Copy(param.place(),
beta1_pow_ptr,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
memory_utils::Copy(param.place(),
beta1_pow_ptr,
beta1_pow.place(),
beta1_pow.data<T>(),
sizeof(T) * beta1_pow.numel());
}
} else {
......@@ -103,22 +103,22 @@ void AdamDenseParamSparseGradKernel(
if (beta2_pow.dtype() == DataType::FLOAT16) {
XPUType* beta2_pow_t =
RAII_GUARD.alloc_l3_or_gm<XPUType>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
beta2_pow_t,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
memory_utils::Copy(param.place(),
beta2_pow_t,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
int r = xpu::cast<XPUType, float>(
dev_ctx.x_context(), beta2_pow_t, beta2_pow_ptr, beta2_pow.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
} else {
beta2_pow_ptr = RAII_GUARD.alloc_l3_or_gm<float>(beta2_pow.numel());
paddle::memory::Copy(param.place(),
beta2_pow_ptr,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
memory_utils::Copy(param.place(),
beta2_pow_ptr,
beta2_pow.place(),
beta2_pow.data<T>(),
sizeof(T) * beta2_pow.numel());
}
} else {
if (beta2_pow.dtype() == DataType::FLOAT16)
......@@ -233,11 +233,11 @@ void AdamDenseParamSparseGradKernel(
rows[i] = static_cast<int>(merge_rows[i]);
}
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(dev_ctx.GetPlace(),
xpu_rows,
CPUPlace(),
rows.data(),
row_count * sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
xpu_rows,
CPUPlace(),
rows.data(),
row_count * sizeof(int));
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
auto ori_rows = param.numel() / row_numel;
......
......@@ -15,11 +15,10 @@ limitations under the License. */
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
template <typename T, typename Context, typename Functor>
......@@ -207,11 +206,11 @@ void PowKernel(const Context& dev_ctx,
T* factor_data = RAII_GUARD.alloc_l3_or_gm<T>(1);
PADDLE_ENFORCE_NOT_NULL(
factor_data, errors::External("XPU alloc_l3_or_gm returns nullptr"));
paddle::memory::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
static_cast<void*>(factor_data),
phi::CPUPlace(),
static_cast<void*>(&pow_factor),
sizeof(T));
auto x_dims = vectorize<int>(x.dims());
// use [1] to replace [], because xpu not support []
......
......@@ -18,11 +18,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -53,11 +53,11 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
const bool* found_inf_data = found_infinite.data<bool>();
bool cpu_found_inf_data = false;
if (found_infinite.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(),
static_cast<const void*>(found_inf_data),
sizeof(bool));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
found_infinite.place(),
static_cast<const void*>(found_inf_data),
sizeof(bool));
} else {
cpu_found_inf_data = (*found_inf_data);
}
......@@ -93,31 +93,31 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (in_bad_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(),
static_cast<const void*>(bad_in_data),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_bad_in_data),
in_bad_steps.place(),
static_cast<const void*>(bad_in_data),
sizeof(int));
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (in_good_steps.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(),
static_cast<const void*>(good_in_data),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_good_in_data),
in_good_steps.place(),
static_cast<const void*>(good_in_data),
sizeof(int));
} else {
cpu_good_in_data = (*good_in_data);
}
if (prev_loss_scaling.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data),
sizeof(MPDType));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_pre_loss_scaling_data),
prev_loss_scaling.place(),
static_cast<const void*>(pre_loss_scaling_data),
sizeof(MPDType));
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
}
......@@ -148,21 +148,21 @@ void UpdateLossScalingKernel(const Context& dev_ctx,
}
}
// copy to device
paddle::memory::Copy(dev_ctx.GetPlace(),
bad_out_data,
phi::CPUPlace(),
&cpu_bad_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
good_out_data,
phi::CPUPlace(),
&cpu_good_out_data,
sizeof(int));
paddle::memory::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data,
phi::CPUPlace(),
&cpu_updated_loss_scaling_data,
sizeof(MPDType));
memory_utils::Copy(dev_ctx.GetPlace(),
bad_out_data,
phi::CPUPlace(),
&cpu_bad_out_data,
sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
good_out_data,
phi::CPUPlace(),
&cpu_good_out_data,
sizeof(int));
memory_utils::Copy(dev_ctx.GetPlace(),
updated_loss_scaling_data,
phi::CPUPlace(),
&cpu_updated_loss_scaling_data,
sizeof(MPDType));
}
template <typename T, typename Context>
......@@ -185,11 +185,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
int nums_inf_nans = 0;
MPDType cpu_scale_data;
if (scale.place().GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data),
scale.place(),
static_cast<const void*>(scale_data),
sizeof(MPDType));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_scale_data),
scale.place(),
static_cast<const void*>(scale_data),
sizeof(MPDType));
} else {
cpu_scale_data = (*scale_data);
......@@ -211,11 +211,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
inf_nan_count.data<int>(),
x->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "count_nan_or_inf");
paddle::memory::Copy(phi::CPUPlace(),
&nums_inf_nans,
dev_ctx.GetPlace(),
inf_nan_count.data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&nums_inf_nans,
dev_ctx.GetPlace(),
inf_nan_count.data<int>(),
sizeof(int));
}
if (nums_inf_nans > 0) {
......@@ -264,11 +264,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
found_inf_data,
phi::CPUPlace(),
&cpu_found_inf_data,
sizeof(bool));
memory_utils::Copy(dev_ctx.GetPlace(),
found_inf_data,
phi::CPUPlace(),
&cpu_found_inf_data,
sizeof(bool));
}
} // namespace phi
......
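The XPU call sites in this file use the five-argument overload of phi::memory_utils::Copy, which omits the stream argument; as in the diff above, the host value is read directly after the copy returns. A minimal sketch with hypothetical names (xpu_place, d_flag, ReadXpuFlag):

    #include "paddle/phi/common/memory_utils.h"
    #include "paddle/phi/common/place.h"

    // Illustrative helper (not part of this PR): copy a single bool flag from an
    // XPU device buffer to the host using the stream-less Copy overload.
    bool ReadXpuFlag(const phi::Place& xpu_place, const bool* d_flag) {
      bool h_flag = false;
      phi::memory_utils::Copy(phi::CPUPlace(),
                              static_cast<void*>(&h_flag),
                              xpu_place,
                              static_cast<const void*>(d_flag),
                              sizeof(bool));
      return h_flag;
    }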
......@@ -17,8 +17,8 @@
#include <memory>
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -46,11 +46,11 @@ void DropoutRawKernel(const Context& dev_ctx,
int seed_data = 0;
if (seed_tensor.get_ptr() != nullptr) {
if ((seed_tensor->place()).GetType() == phi::AllocationType::XPU) {
paddle::memory::Copy(phi::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
} else {
seed_data = *(seed_tensor->data<int>());
}
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/embedding_grad_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/embedding_util.h"
......@@ -99,11 +99,11 @@ void EmbeddingSparseGradKernel(const Context& ctx,
int r = xpu::cast<int32_t, int64_t>(
ctx.x_context(), input.data<int>(), id_t, input.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
paddle::memory::Copy(CPUPlace(),
ids_cpu.data(),
input.place(),
id_t,
sizeof(int64_t) * input.numel());
memory_utils::Copy(CPUPlace(),
ids_cpu.data(),
input.place(),
id_t,
sizeof(int64_t) * input.numel());
ids = CopyIdsToVector<int, int64_t>(ids_cpu);
} else {
PADDLE_THROW(phi::errors::Unimplemented(
......@@ -140,11 +140,11 @@ void EmbeddingSparseGradKernel(const Context& ctx,
d_table_value->dims(),
d_output_dims_2d));
paddle::memory::Copy(CPUPlace(),
d_table_data,
xpu_place,
d_output_data,
d_output->numel() * sizeof(T));
memory_utils::Copy(CPUPlace(),
d_table_data,
xpu_place,
d_output_data,
d_output->numel() * sizeof(T));
}
} // namespace phi
......
......@@ -24,7 +24,7 @@
#include "paddle/phi/core/visit_type.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/gaussian_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -48,11 +48,11 @@ void GaussianKernel(const Context& ctx,
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -37,11 +37,11 @@ static void SortDescending(const XPUContext& dev_ctx,
scores_slice_cpu.Resize({value.numel()});
T* scores_slice_cpu_data = dev_ctx.template HostAlloc<T>(&scores_slice_cpu);
paddle::memory::Copy(cpu_place,
scores_slice_cpu_data,
place,
value_data,
sizeof(T) * value.numel());
memory_utils::Copy(cpu_place,
scores_slice_cpu_data,
place,
value_data,
sizeof(T) * value.numel());
// Sort index
DenseTensor index_t;
index_t.Resize({value.numel()});
......@@ -65,7 +65,7 @@ static void SortDescending(const XPUContext& dev_ctx,
index_out->Resize({index_t.numel()});
int* idx_out = dev_ctx.template Alloc<int>(index_out);
paddle::memory::Copy(
memory_utils::Copy(
place, idx_out, cpu_place, index, sizeof(T) * index_t.numel());
}
......@@ -180,11 +180,11 @@ std::pair<DenseTensor, DenseTensor> ProposalForOneImage(
int keep_num;
const auto xpu_place = dev_ctx.GetPlace();
paddle::memory::Copy(phi::CPUPlace(),
&keep_num,
xpu_place,
keep_num_t.data<int>(),
sizeof(int));
memory_utils::Copy(phi::CPUPlace(),
&keep_num,
xpu_place,
keep_num_t.data<int>(),
sizeof(int));
keep_index.Resize({keep_num});
DenseTensor scores_filter, proposals_filter;
......@@ -395,7 +395,7 @@ void GenerateProposalsKernel(const Context& dev_ctx,
rpn_rois_num->Resize(phi::make_ddim({num}));
dev_ctx.template Alloc<int>(rpn_rois_num);
int* num_data = rpn_rois_num->data<int>();
paddle::memory::Copy(
memory_utils::Copy(
place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num);
}
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/lamb_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -61,11 +61,11 @@ void LambKernel(const Context& dev_ctx,
cpu_skip_update = *(skip_update->data<bool>());
} else {
const bool* skip_update_flag = skip_update->data<bool>();
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag),
sizeof(bool));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&cpu_skip_update),
dev_ctx.GetPlace(),
static_cast<const void*>(skip_update_flag),
sizeof(bool));
}
}
if (cpu_skip_update) {
......@@ -114,11 +114,11 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta1_pow_xpu_ptr),
(beta1_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr,
beta1_pow.place(),
beta1_pow.data<MT>(),
sizeof(MT) * beta1_pow.numel());
memory_utils::Copy(dev_ctx.GetPlace(),
beta1_pow_xpu_ptr,
beta1_pow.place(),
beta1_pow.data<MT>(),
sizeof(MT) * beta1_pow.numel());
beta1_pow_ptr = beta1_pow_xpu_ptr;
beta1_pow_out_ptr = RAII_GUARD.alloc_l3_or_gm<MT>(beta1_pow_out->numel());
PADDLE_ENFORCE_XDNN_NOT_NULL(beta1_pow_out_ptr);
......@@ -130,11 +130,11 @@ void LambKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&beta2_pow_xpu_ptr),
(beta2_pow.numel()) * sizeof(MT));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr,
beta2_pow.place(),
beta2_pow.data<MT>(),
sizeof(MT) * beta2_pow.numel());
memory_utils::Copy(dev_ctx.GetPlace(),
beta2_pow_xpu_ptr,
beta2_pow.place(),
beta2_pow.data<MT>(),
sizeof(MT) * beta2_pow.numel());
beta2_pow_ptr = beta2_pow_xpu_ptr;
beta2_pow_out_ptr = RAII_GUARD.alloc_l3_or_gm<MT>(beta2_pow_out->numel());
......@@ -198,22 +198,22 @@ void LambKernel(const Context& dev_ctx,
if (beta1_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta1_pow_out from xpu to cpu
paddle::memory::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(),
beta1_pow_out_ptr,
sizeof(MT) * beta1_pow_out->numel());
memory_utils::Copy(beta1_pow.place(),
dev_ctx.template HostAlloc<MT>(beta1_pow_out),
dev_ctx.GetPlace(),
beta1_pow_out_ptr,
sizeof(MT) * beta1_pow_out->numel());
if (beta1_pow_xpu_ptr) {
xpu_free(beta1_pow_xpu_ptr);
}
}
if (beta2_pow.place().GetType() == phi::AllocationType::CPU) {
// copy beta2_pow_out from xpu to cpu
paddle::memory::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(),
beta2_pow_out_ptr,
sizeof(MT) * beta2_pow_out->numel());
memory_utils::Copy(beta2_pow.place(),
dev_ctx.template HostAlloc<MT>(beta2_pow_out),
dev_ctx.GetPlace(),
beta2_pow_out_ptr,
sizeof(MT) * beta2_pow_out->numel());
if (beta2_pow_xpu_ptr) {
xpu_free(beta2_pow_xpu_ptr);
}
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -49,11 +49,11 @@ void MaskedSelectKernel(const Context& dev_ctx,
xpu::nonzero_count(
dev_ctx.x_context(), mask_data, out_size, mask.numel()),
"nonzero_count ");
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu),
mask.place(),
static_cast<void*>(out_size),
sizeof(int32_t));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&out_size_cpu),
mask.place(),
static_cast<void*>(out_size),
sizeof(int32_t));
DDim out_dim{out_size_cpu};
out->Resize(out_dim);
......
......@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/mean_all_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -40,7 +40,7 @@ void MeanAllGradKernel(const Context& dev_ctx,
const T* dy = OG->data<T>();
T dy0_value;
xpu_wait(dev_ctx.x_context()->xpu_stream);
paddle::memory::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
memory_utils::Copy(phi::CPUPlace(), &dy0_value, OG->place(), dy, sizeof(T));
float dy0_fp32 = static_cast<float>(dy0_value);
dy0_fp32 = dy0_fp32 / static_cast<float>(IG->numel());
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/nonzero_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/backends/xpu/xpu_header.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -42,11 +42,11 @@ void NonZeroKernel(const Context& dev_ctx,
ret,
XPUAPIErrorMsg[ret]));
paddle::memory::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(true_num),
sizeof(int32_t));
memory_utils::Copy(phi::CPUPlace(),
static_cast<void*>(&true_num_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(true_num),
sizeof(int32_t));
out->Resize(phi::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
auto* out_data = dev_ctx.template Alloc<int64_t>(out);
......
......@@ -16,8 +16,8 @@
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/generator.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -47,11 +47,11 @@ void RandintRawKernel(const Context& dev_ctx,
for (int64_t i = 0; i < numel; ++i) {
data_cpu[i] = dist(*engine);
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
template <typename T, typename Context>
......
......@@ -17,7 +17,7 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -48,11 +48,11 @@ void RmspropDenseKernel(const Context& dev_ctx,
" But received learning rate dim [%s] ",
learning_rate.dims().size()));
T learning_rate_cpu = 0.0f;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()),
sizeof(T));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&learning_rate_cpu),
dev_ctx.GetPlace(),
static_cast<const void*>(learning_rate.data()),
sizeof(T));
// alloc output
dev_ctx.template Alloc<T>(param_out);
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -51,11 +51,11 @@ void RoiAlignGradKernel(const Context& dev_ctx,
if (boxes_num) {
rois_batch_size = boxes_num->numel();
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
cpu_lod = new int[rois_batch_size + 1];
cpu_lod[0] = 0;
for (int i = 0; i < rois_batch_size; i++) {
......@@ -73,11 +73,11 @@ void RoiAlignGradKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
dev_ctx.template Alloc<T>(dx);
int output_grad_size = out_grad.numel();
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/roi_align_kernel.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -62,11 +62,11 @@ void RoiAlignKernel(const Context& dev_ctx,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
paddle::memory::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
memory_utils::Copy(cplace,
rois_num_list.data(),
xplace,
boxes_num->data<int>(),
sizeof(int) * rois_batch_size);
cpu_lod = new int[rois_batch_size + 1];
cpu_lod[0] = 0;
for (int i = 0; i < rois_batch_size; i++) {
......@@ -115,11 +115,11 @@ void RoiAlignKernel(const Context& dev_ctx,
int r = xpu_malloc(reinterpret_cast<void**>(&roi_id_data),
(rois_batch_size + 1) * sizeof(int));
PADDLE_ENFORCE_XPU_SUCCESS(r);
paddle::memory::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
memory_utils::Copy(xplace,
roi_id_data,
cplace,
cpu_lod,
(rois_batch_size + 1) * sizeof(int));
delete[] cpu_lod;
r = xpu::roi_align<T, int>(dev_ctx.x_context(),
x.data<T>(),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -66,11 +66,11 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(in_grad->data<T>()),
reinterpret_cast<XPUType*>(in_grad->data<T>()),
......
......@@ -20,7 +20,7 @@
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -62,11 +62,11 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context& dev_ctx,
x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "nonzero_count");
int non_zero_cpu = 0;
paddle::memory::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
memory_utils::Copy(CPUPlace(),
static_cast<void*>(&non_zero_cpu),
dev_ctx.GetPlace(),
static_cast<void*>(non_zero),
sizeof(int));
r = xpu::scale(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out->data<T>()),
......
......@@ -17,8 +17,8 @@ limitations under the License. */
#include <limits>
#include <random>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/truncated_normal.h"
......@@ -52,11 +52,11 @@ void TruncatedGaussianRandomKernel(const Context& dev_ctx,
data_cpu[i] = truncated_normal(dist(*engine));
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void*>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
......@@ -67,11 +67,11 @@ void UniformRawKernel(const Context &dev_ctx,
}
}
paddle::memory::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()),
size * sizeof(T));
memory_utils::Copy(dev_ctx.GetPlace(),
data,
phi::CPUPlace(),
reinterpret_cast<void *>(data_cpu.get()),
size * sizeof(T));
}
} // namespace phi
......
......@@ -16,9 +16,8 @@ limitations under the License. */
#include "paddle/phi/common/transform.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/hostdevice.h"
template <typename T>
......@@ -37,9 +36,6 @@ class Multiply {
HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
using paddle::memory::Alloc;
using paddle::memory::Copy;
using paddle::platform::CPUPlace;
using paddle::platform::CUDAPlace;
using phi::CPUContext;
......@@ -63,13 +59,15 @@ TEST(Transform, GPUUnary) {
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4);
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(float) * 4);
float* gpu_buf = static_cast<float*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
ctx->Wait();
Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
}
......@@ -91,13 +89,15 @@ TEST(Transform, GPUBinary) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_allocation = Alloc(gpu0, sizeof(buf));
auto gpu_allocation = phi::memory_utils::Alloc(gpu0, sizeof(buf));
int* gpu_buf = static_cast<int*>(gpu_allocation->ptr());
Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx->stream());
Transform<phi::GPUContext> trans;
trans(*ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
ctx->Wait();
Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
phi::memory_utils::Copy(
CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx->stream());
for (int i = 0; i < 4; ++i) {
ASSERT_EQ((i + 1) * (i + 1), buf[i]);
}
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
......@@ -96,7 +95,7 @@ TEST(StridedMemcpy, GPUCrop) {
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
phi::DDim src_stride({5, 1});
......@@ -110,7 +109,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src + 1, src_stride, dst_dim, dst_stride, gpu_dst);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
ASSERT_EQ(1, dst[0]);
......@@ -135,7 +134,7 @@ TEST(StridedMemcpy, GPUConcat) {
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
memory_utils::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
......@@ -150,7 +149,7 @@ TEST(StridedMemcpy, GPUConcat) {
phi::funcs::StridedMemcpy<int>(
*ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst + 2);
paddle::memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
memory_utils::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx->stream());
ctx->Wait();
// clang-format off
......