Unverified commit 1cfcb71d, authored by YuanRisheng, committed by GitHub

[PHI Decoupling] Remove memory header (Part1) (#50419)

* decouple_memory

* perfect memory utils

* fix ci bugs

* fix inference bugs

* fix custom test bugs

* fix coverage bugs

* modify code according to comments

* modify namespace

* deal with compile bugs
Parent 7a156f18
......@@ -524,6 +524,7 @@ bool AnalysisPredictor::PrepareScope(
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
// TODO(wilber): we need to release memory occupied by weights.
scope_.reset(new paddle::framework::Scope());
......
......@@ -95,6 +95,7 @@ bool NativePaddlePredictor::Init(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
scope_.reset(new paddle::framework::Scope());
}
......
......@@ -187,7 +187,7 @@ bool ONNXRuntimePredictor::Init() {
session_ = std::make_shared<Ort::Session>(
*env_, onnx_proto, static_cast<size_t>(out_size), session_options);
InitBinding();
paddle::framework::InitMemoryMethod();
delete onnx_proto;
onnx_proto = nullptr;
return true;
......
......@@ -3,4 +3,8 @@ if(WITH_CUSTOM_DEVICE)
custom_device_resource_pool
SRCS custom_device_resource_pool.cc
DEPS gflags glog enforce monitor)
cc_test(
custom_device_test
SRCS custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
endif()
......@@ -18,6 +18,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/custom/fake_cpu_device.h"
#include "paddle/phi/backends/device_manager.h"
......@@ -239,6 +240,7 @@ void TestCustomCCL(const paddle::platform::Place& place) {
TEST(CustomDevice, Tensor) {
InitDevice();
paddle::framework::InitMemoryMethod();
auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
......
......@@ -55,6 +55,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
DECLARE_int32(paddle_num_threads);
......@@ -84,6 +86,7 @@ namespace framework {
std::once_flag gflags_init_flag;
std::once_flag glog_init_flag;
std::once_flag npu_init_flag;
std::once_flag memory_method_init_flag;
bool InitGflags(std::vector<std::string> args) {
bool successed = false;
......@@ -456,5 +459,20 @@ void InitGLOG(const std::string &prog_name) {
});
}
void InitMemoryMethod() {
std::call_once(memory_method_init_flag, [&]() {
auto &memory_utils = phi::MemoryUtils::Instance();
auto memory_method = std::make_unique<phi::MemoryInterface>();
memory_method->alloc = paddle::memory::Alloc;
memory_method->alloc_with_stream = paddle::memory::Alloc;
memory_method->alloc_shared = paddle::memory::AllocShared;
memory_method->alloc_shared_with_stream = paddle::memory::AllocShared;
memory_method->in_same_stream = paddle::memory::InSameStream;
memory_method->allocation_deleter =
paddle::memory::allocation::Allocator::AllocationDeleter;
memory_utils.Init(std::move(memory_method));
});
}
} // namespace framework
} // namespace paddle
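The hunk above wires the fluid allocator into phi through a struct of raw function pointers that is filled in exactly once at startup. Below is a minimal standalone sketch of that injection pattern, not part of this commit; the names mirror the phi types but the alloc/free signatures are simplified placeholders.

// sketch_init_memory_method.cc -- illustrative only, simplified signatures.
#include <cstddef>
#include <cstdio>
#include <memory>
#include <mutex>

struct MemoryInterface {  // plays the role of phi::MemoryInterface
  void* (*alloc)(std::size_t size);
  void (*free)(void* ptr);
};

class MemoryUtils {  // plays the role of phi::MemoryUtils
 public:
  static MemoryUtils& Instance() {
    static MemoryUtils g_instance;
    return g_instance;
  }
  void Init(std::unique_ptr<MemoryInterface> m) { method_ = std::move(m); }
  void* Alloc(std::size_t size) { return method_->alloc(size); }
  void Free(void* ptr) { method_->free(ptr); }

 private:
  MemoryUtils() = default;
  std::unique_ptr<MemoryInterface> method_;
};

// Plays the role of paddle::framework::InitMemoryMethod(): register the
// concrete allocator once, before any consumer touches MemoryUtils.
void InitMemoryMethod() {
  static std::once_flag init_flag;
  std::call_once(init_flag, [] {
    auto m = std::make_unique<MemoryInterface>();
    m->alloc = [](std::size_t size) { return ::operator new(size); };
    m->free = [](void* ptr) { ::operator delete(ptr); };
    MemoryUtils::Instance().Init(std::move(m));
  });
}

int main() {
  InitMemoryMethod();
  void* p = MemoryUtils::Instance().Alloc(64);
  std::printf("allocated 64 bytes at %p\n", p);
  MemoryUtils::Instance().Free(p);
  return 0;
}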
......@@ -39,6 +39,8 @@ void InitDevices();
void InitDevices(const std::vector<int> devices);
void InitMemoryMethod();
#ifndef _WIN32
class SignalMessageDumper {
public:
......
......@@ -1844,6 +1844,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG);
m.def("init_memory_method", framework::InitMemoryMethod);
m.def("load_op_meta_info_and_register_op", [](const std::string dso_name) {
egr::Controller::Instance().MergeOpMetaInfoMap(
framework::LoadOpMetaInfoAndRegisterOp(dso_name));
......
......@@ -149,18 +149,24 @@ const Place &Tensor::place() const {
return impl_->place();
}
bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(place()); }
bool Tensor::is_cpu() const {
return place().GetType() == phi::AllocationType::CPU;
}
bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(place()); }
bool Tensor::is_gpu() const {
return place().GetType() == phi::AllocationType::GPU;
}
bool Tensor::is_gpu_pinned() const {
return paddle::platform::is_cuda_pinned_place(place());
return place().GetType() == phi::AllocationType::GPUPINNED;
}
bool Tensor::is_xpu() const { return paddle::platform::is_xpu_place(place()); }
bool Tensor::is_xpu() const {
return place().GetType() == phi::AllocationType::XPU;
}
bool Tensor::is_custom_device() const {
return paddle::platform::is_custom_place(place());
return place().GetType() == phi::AllocationType::CUSTOM;
}
/* Part 4: Data Access methods */
......
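For reference, the place checks that replace the fluid helpers above reduce to comparing the AllocationType carried by phi::Place. A hypothetical free-standing helper (not part of this commit) would look like:

#include "paddle/phi/common/place.h"

// Equivalent of the old paddle::platform::is_gpu_place(place) check, written
// purely in terms of phi types, as the hunk above does.
inline bool IsGpuPlace(const phi::Place& place) {
  return place.GetType() == phi::AllocationType::GPU;
}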
......@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
namespace paddle {
......@@ -23,15 +23,14 @@ namespace experimental {
class DefaultAllocator : public phi::Allocator {
public:
explicit DefaultAllocator(const paddle::platform::Place& place)
: place_(place) {}
explicit DefaultAllocator(const phi::Place& place) : place_(place) {}
AllocationPtr Allocate(size_t bytes_size) override {
return memory::Alloc(place_, bytes_size);
return phi::memory_utils::Alloc(place_, bytes_size);
}
private:
paddle::platform::Place place_;
phi::Place place_;
};
} // namespace experimental
......
......@@ -61,10 +61,6 @@ set(phi_modules ${phi_modules} phi_backends)
set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}")
if(WITH_CUSTOM_DEVICE)
cc_test(
custom_device_test
SRCS custom/custom_device_test.cc
DEPS phi_tensor_utils phi_backends phi_device_context gradient_accumulator)
cc_test(
capi_test
SRCS custom/capi_test.cc
......
......@@ -20,3 +20,7 @@ cc_library(
int_array
SRCS int_array.cc
DEPS phi_enforce phi_tensor_utils)
cc_library(
memory_utils
SRCS memory_utils.cc
DEPS phi_enforce phi_place)
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/common/memory_utils.h"
namespace phi {
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().Alloc(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().Alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
return MemoryUtils::Instance().AllocShared(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size) {
return MemoryUtils::Instance().AllocShared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
return MemoryUtils::Instance().InSameStream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
MemoryUtils::Instance().AllocationDeleter(allocation);
}
} // namespace memory_utils
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/stream.h"
namespace phi {
struct MemoryInterface {
/**
* @brief Allocate a unique allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
*/
Allocator::AllocationPtr (*alloc)(const phi::Place& place, size_t size);
/**
   * @brief Allocate a unique allocation on the given stream.
   *
   * @param[phi::Place] place The target GPU place that will be allocated
   * @param[size_t] size memory size
   * @param[phi::Stream] stream the stream used by the allocator
*/
Allocator::AllocationPtr (*alloc_with_stream)(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
*/
std::shared_ptr<Allocation> (*alloc_shared)(const phi::Place& place,
size_t size);
/**
* @brief Allocate a shared allocation.
*
* @param[phi::Place] place The target place that will be allocated
* @param[size_t] size memory size
   * @param[phi::Stream] stream the stream used by the allocator
*/
std::shared_ptr<Allocation> (*alloc_shared_with_stream)(
const phi::Place& place, size_t size, const phi::Stream& stream);
/**
   * @brief Whether the allocation belongs to the given stream.
   *
   * @param[Allocation] allocation the allocation to check
   * @param[phi::Stream] stream the device's stream
*/
bool (*in_same_stream)(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
/**
   * @brief Free the given allocation.
*
* @param[Allocation] allocation the allocation to be freed
*/
void (*allocation_deleter)(Allocation* allocation);
};
class MemoryUtils {
public:
static MemoryUtils& Instance() {
static MemoryUtils g_memory_utils;
return g_memory_utils;
}
void Init(std::unique_ptr<MemoryInterface> memory_method) {
memory_method_ = std::move(memory_method);
}
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_with_stream,
nullptr,
                      phi::errors::Unavailable(
                          "alloc_with_stream method in memory_method_ is not "
                          "initialized yet. You need to init it first."));
return memory_method_->alloc_with_stream(place, size, stream);
}
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->alloc,
nullptr,
        phi::errors::Unavailable("alloc method in memory_method_ is not "
                                 "initialized yet. You need to init it first."));
return memory_method_->alloc(place, size);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->alloc_shared_with_stream,
nullptr,
                      phi::errors::Unavailable(
                          "alloc_shared_with_stream method in memory_method_ "
                          "is not initialized yet. You need to init it first."));
return memory_method_->alloc_shared_with_stream(place, size, stream);
}
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->alloc_shared,
nullptr,
        phi::errors::Unavailable("alloc_shared method in memory_method_ is not "
                                 "initialized yet. You need to init it first."));
return memory_method_->alloc_shared(place, size);
}
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->in_same_stream,
nullptr,
        phi::errors::Unavailable("in_same_stream method in memory_method_ is "
                                 "not initialized yet. You need to init it first."));
return memory_method_->in_same_stream(allocation, stream);
}
void AllocationDeleter(Allocation* allocation) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(memory_method_->allocation_deleter,
nullptr,
                      phi::errors::Unavailable(
                          "allocation_deleter method in memory_method_ is not "
                          "initialized yet. You need to init it first."));
return memory_method_->allocation_deleter(allocation);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
nullptr,
        phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
                                 "initialized yet. You need to init it first."));
}
private:
MemoryUtils() = default;
std::unique_ptr<MemoryInterface> memory_method_ = nullptr;
DISABLE_COPY_AND_ASSIGN(MemoryUtils);
};
/*
NOTE(YuanRisheng) Why should we add the following code?
We need this because MemoryUtils::Instance() is a singleton object and we
don't recommend using singleton objects directly in kernels. So we wrap it
with free functions; if we remove this singleton object in the future, the
call sites will be easy to change.
*/
namespace memory_utils {
Allocator::AllocationPtr Alloc(const phi::GPUPlace& place,
size_t size,
const phi::Stream& stream);
Allocator::AllocationPtr Alloc(const phi::Place& place, size_t size);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place,
size_t size,
const phi::Stream& stream);
std::shared_ptr<Allocation> AllocShared(const phi::Place& place, size_t size);
bool InSameStream(const std::shared_ptr<Allocation>& allocation,
const phi::Stream& stream);
void AllocationDeleter(Allocation* allocation);
} // namespace memory_utils
} // namespace phi
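A typical call site after this change is a phi kernel allocating a stream-bound scratch buffer through the new memory_utils free functions instead of paddle/fluid/memory/malloc.h. The sketch below uses only signatures introduced in this header; AllocTmpBuffer is a hypothetical helper name, not part of this commit.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

namespace phi {

// Allocate `num_bytes` of device memory bound to the context's stream.
inline Allocator::AllocationPtr AllocTmpBuffer(const phi::GPUContext& ctx,
                                               size_t num_bytes) {
  return phi::memory_utils::Alloc(
      ctx.GetPlace(),
      num_bytes,
      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
}

}  // namespace phi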
......@@ -68,7 +68,7 @@ cc_library(
SRCS dense_tensor.cc dense_tensor_impl.cc
DEPS convert_utils tensor_meta tensor_base ddim)
target_link_libraries(dense_tensor malloc)
target_link_libraries(dense_tensor memory_utils)
cc_library(
sparse_coo_tensor
......@@ -130,20 +130,24 @@ if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows malloc memcpy device_context)
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows malloc memcpy device_context phi_backends)
DEPS dense_tensor selected_rows memcpy device_context phi_backends
memory_utils)
endif()
......@@ -37,7 +37,6 @@ limitations under the License. */
 * In the future, the necessary components will be moved to this library,
* or the corresponding components will be re-implemented.
*/
#include "paddle/fluid/memory/malloc.h"
namespace phi {
......
......@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -111,7 +111,7 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size);
holder_ = memory_utils::AllocShared(place, size);
meta_.offset = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......@@ -140,9 +140,9 @@ void* DenseTensor::mutable_data(const Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset ||
!(place.GetType() == phi::AllocationType::GPU &&
paddle::memory::InSameStream(holder_, stream))) {
memory_utils::InSameStream(holder_, stream))) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size, stream);
holder_ = memory_utils::AllocShared(place, size, stream);
meta_.offset = 0;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
......
......@@ -22,9 +22,9 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"
......@@ -59,7 +59,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void *src = cpu_->data();
*gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T)
(*gpu_) = paddle::memory::Alloc(place, *gpu_memory_size_);
(*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_);
void *dst = (*gpu_)->ptr();
auto *dev_ctx = static_cast<phi::GPUContext *>(
phi::DeviceContextPool::Instance().Get(place));
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/core/string_tensor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/pstring.h"
namespace phi {
......@@ -190,7 +190,7 @@ dtype::pstring* StringTensor::mutable_data(const phi::Place& place,
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + meta_.offset) {
holder_.reset();
holder_ = paddle::memory::AllocShared(place, size);
holder_ = memory_utils::AllocShared(place, size);
// Initialize the allocated bytes
init_holder();
meta_.offset = 0;
......
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/segmented_array.h"
namespace phi {
......@@ -94,12 +94,12 @@ struct PointerToPointer {
PointerToPointer(const phi::GPUContext& ctx,
const std::vector<phi::DenseTensor>& ins,
const T** pre_alloced_host_ptr,
paddle::memory::AllocationPtr* dev_ins_ptr) {
phi::Allocator::AllocationPtr* dev_ins_ptr) {
auto in_num = ins.size();
for (auto i = 0; i < in_num; ++i) {
pre_alloced_host_ptr[i] = ins[i].data<T>();
}
*dev_ins_ptr = paddle::memory::Alloc(
*dev_ins_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
in_num * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -147,9 +147,9 @@ struct PointerToPointerAndCol {
const IndexT inputs_col_num,
const T** pre_alloced_host_ptr,
IndexT* inputs_col,
paddle::memory::AllocationPtr* dev_ins_ptr,
paddle::memory::AllocationPtr* dev_col_ptr) {
*dev_col_ptr = paddle::memory::Alloc(
phi::Allocator::AllocationPtr* dev_ins_ptr,
phi::Allocator::AllocationPtr* dev_col_ptr) {
*dev_col_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
inputs_col_num * sizeof(IndexT),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -279,8 +279,8 @@ void DispatchConcatWithDifferentShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_col_array, inputs_col_num, out_row, out_col, output->data()));
default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr};
paddle::memory::AllocationPtr dev_col_ptr{nullptr};
phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
phi::Allocator::AllocationPtr dev_col_ptr{nullptr};
PointerToPointerAndCol<T, IndexT> ptr_col_array(ctx,
ins,
inputs_col_num,
......@@ -396,7 +396,7 @@ void DispatchConcatWithSameShapeKernelLimitNum(
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
ptr_array, in_col, out_row, out_col, output->data()));
default: {
paddle::memory::AllocationPtr dev_ins_ptr{nullptr};
phi::Allocator::AllocationPtr dev_ins_ptr{nullptr};
PointerToPointer<T> ptr_array(ctx, ins, inputs_data, &dev_ins_ptr);
ConcatTensorWithSameShape<IndexT, MovSize, decltype(ptr_array)>
<<<grid_dims, block_dims, 0, ctx.stream()>>>(
......@@ -570,10 +570,10 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
IndexT* inputs_col = inputs_col_vec.data();
#ifdef PADDLE_WITH_HIP
// TODO(chentianyu03): try to find a method to remove the Alloc function
paddle::memory::AllocationPtr data_alloc = paddle::memory::Alloc(
phi::Allocator::AllocationPtr data_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*));
inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
paddle::memory::AllocationPtr col_alloc = paddle::memory::Alloc(
phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(IndexT));
inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
#endif
......@@ -609,10 +609,8 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
col_alloc_released);
phi::memory_utils::AllocationDeleter(data_alloc_released);
phi::memory_utils::AllocationDeleter(col_alloc_released);
});
#endif
}
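The stream-callback cleanup above can be factored into a small helper. The sketch below (a hypothetical FreeAfterStream, not part of this commit) shows the pattern: release ownership of the pinned allocation and delete it only after the work already queued on the stream has finished.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"

namespace phi {

// Defer freeing `buf` until all work currently queued on ctx's stream is done.
inline void FreeAfterStream(const phi::GPUContext& ctx,
                            phi::Allocator::AllocationPtr buf) {
  auto* released = buf.release();
  ctx.AddStreamCallback(
      [released] { phi::memory_utils::AllocationDeleter(released); });
}

}  // namespace phi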
......@@ -786,14 +784,14 @@ void SplitFunctorDispatchWithIndexType(
// 3.2.6.1. Concurrent Execution between Host and Device
// Memory copies from host to device of a memory block of 64 KB or less
#ifdef PADDLE_WITH_HIP
paddle::memory::AllocationPtr data_alloc, cols_alloc;
phi::Allocator::AllocationPtr data_alloc, cols_alloc;
// TODO(chentianyu03): try to find a method to remove the Alloc function
data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
data_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
out_num * sizeof(T*));
outs_data = reinterpret_cast<T**>(data_alloc->ptr());
// TODO(chentianyu03): try to find a method to remove the Alloc function
cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
cols_alloc = phi::memory_utils::Alloc(paddle::platform::CUDAPinnedPlace(),
(out_cols_num) * sizeof(IndexT));
outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
#endif
......@@ -840,10 +838,8 @@ void SplitFunctorDispatchWithIndexType(
auto* data_alloc_released = data_alloc.release();
auto* cols_alloc_released = cols_alloc.release();
ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
paddle::memory::allocation::Allocator::AllocationDeleter(
data_alloc_released);
paddle::memory::allocation::Allocator::AllocationDeleter(
cols_alloc_released);
phi::memory_utils::AllocationDeleter(data_alloc_released);
phi::memory_utils::AllocationDeleter(cols_alloc_released);
});
#endif
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/elementwise_utils.h"
......@@ -1533,7 +1534,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
// One part buffer for x_strides_array, rest for y_strides_array and
// out_dims_array.
size_t tmp_total_bytes = bytes * 3;
auto tmp_buffer = paddle::memory::Alloc(
auto tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
tmp_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -1564,7 +1565,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads);
if (dx) {
size_t dx_total_bytes = bytes * 2;
auto dx_tmp_buffer = paddle::memory::Alloc(
auto dx_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
dx_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......@@ -1603,7 +1604,7 @@ void CommonGradBroadcastCUDA(const DenseTensor &x,
if (dy) {
// One part buffer for y_strides_order_gpu, the other for y_dims_order_gpu
size_t dy_total_bytes = bytes * 2;
auto dy_tmp_buffer = paddle::memory::Alloc(
auto dy_tmp_buffer = phi::memory_utils::Alloc(
ctx.GetPlace(),
dy_total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -25,13 +25,11 @@ namespace cub = hipcub;
#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/for_range.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace phi {
namespace funcs {
......@@ -50,7 +48,7 @@ static void CubInclusiveScan(InputIterator x_iter,
size_t n,
BinaryOp op,
const phi::GPUContext &dev_ctx) {
paddle::memory::allocation::AllocationPtr allocation;
phi::Allocator::AllocationPtr allocation;
void *temp_storage = nullptr;
size_t temp_storage_bytes = 0;
for (size_t i = 0; i < 2; ++i) {
......@@ -64,7 +62,7 @@ static void CubInclusiveScan(InputIterator x_iter,
dev_ctx.stream()));
if (i == 0 && temp_storage_bytes > 0) {
allocation =
paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
phi::memory_utils::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
temp_storage = allocation->ptr();
}
}
......
......@@ -24,9 +24,9 @@ namespace cub = hipcub;
#include <iostream>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
......@@ -1923,11 +1923,11 @@ static void LayerNormBackward(
constexpr int part_size = BDIMY2 * VPT;
const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1);
auto part_grad_gamma_ptr = paddle::memory::Alloc(
auto part_grad_gamma_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto part_grad_beta_ptr = paddle::memory::Alloc(
auto part_grad_beta_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
part_size * feature_size * sizeof(U),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -14,12 +14,12 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
......@@ -191,8 +191,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) {
......@@ -234,8 +234,8 @@ struct TransposeNormal<phi::GPUContext, T> {
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
for (int i = 0; i < rank; ++i) {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_inverse.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
......@@ -31,12 +30,12 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
int n = mat_dims[rank - 1];
int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data;
phi::Allocator::AllocationPtr tmp_gpu_mat_data;
const T* gpu_mat = a.data<T>();
if (n >= 32) {
// Copy all elements of input matrix A to a temporary memory space to
// avoid being overridden by getrf.
tmp_gpu_mat_data = paddle::memory::Alloc(
tmp_gpu_mat_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
a.numel() * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -59,11 +58,10 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
// and allocate device memory for info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
size_t total_bytes = cpu_ptrs.size() * sizeof(T*) + num_ints * sizeof(int);
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/matrix_solve.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -79,11 +80,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
}
// Copy the addresses of A and tmp_b from host to device.
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
context.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
paddle::memory::Copy(context.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
phi::CPUPlace(),
......@@ -96,11 +96,10 @@ void MatrixSolveFunctor<Context, T>::operator()(const Context& context,
// Allocate device memory for BatchedGETRF's info and pivots.
int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
paddle::memory::Alloc(
context.GetPlace(),
num_ints * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
phi::Allocator::AllocationPtr tmp_gpu_info_data = phi::memory_utils::Alloc(
context.GetPlace(),
num_ints * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(context.stream())));
int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
auto blas = phi::funcs::GetBlas<Context, T>(context);
......
......@@ -15,6 +15,7 @@
#pragma once
#include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
......@@ -107,7 +108,7 @@ struct ArraySetterBase {
void* src,
size_t num_bytes,
bool use_cuda_graph = false) {
allocation = paddle::memory::Alloc(
allocation = phi::memory_utils::Alloc(
ctx.GetPlace(),
num_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -26,7 +26,6 @@ namespace cub = hipcub;
#include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/ddim.h"
......
......@@ -14,10 +14,10 @@
#pragma once
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
......@@ -337,7 +337,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......@@ -389,7 +389,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......@@ -443,7 +443,7 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc(
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
......
......@@ -20,6 +20,7 @@
#endif // PADDLE_WITH_CUDA
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/transpose_kernel.h"
......@@ -405,7 +406,7 @@ struct MatrixEighFunctor<GPUContext, T> {
&workspace_size);
}
size_t total_bytes = sizeof(T) * workspace_size + sizeof(int) * batch_size;
auto work = paddle::memory::Alloc(
auto work = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
total_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h"
#include "cutlass/epilogue/thread/linear_combination_residual_block.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -111,11 +112,10 @@ cutlass::Status Conv2dBiasAddReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_leaky_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasLeakyReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasReluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_bias_relu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -115,11 +116,10 @@ cutlass::Status Conv2dBiasReluFewChannelsImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -15,6 +15,7 @@
#include <mutex>
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
#include "cutlass/epilogue/thread/linear_combination_silu.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h"
namespace phi {
......@@ -114,11 +115,10 @@ cutlass::Status Conv2dBiasSiluImpl(ConvAllParams params) {
auto ctx = params.ctx;
auto stream = ctx->stream();
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
ctx->GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
void *workspace = tmp_gpu_ptrs_data->ptr();
cutlass::Status status = implicit_gemm_op.can_implement(arguments);
......
......@@ -22,8 +22,6 @@
#include "cutlass/conv/device/implicit_gemm_convolution.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -14,9 +14,9 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
namespace phi {
......@@ -205,7 +205,7 @@ void AddNKernel(const Context &dev_ctx,
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = paddle::memory::Alloc(
auto tmp_sr_in_out_array = phi::memory_utils::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
......@@ -226,8 +226,8 @@ void AddNKernel(const Context &dev_ctx,
}
// If in_data is not null, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array =
paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
auto tmp_in_array = phi::memory_utils::Alloc(dev_ctx.GetPlace(),
in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
......
......@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/amp_kernel.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
......@@ -159,10 +160,10 @@ class LazyZeros<phi::GPUContext, T> {
const auto& cpu_place = phi::CPUPlace();
// alloc each tensor's start index and copy to device
auto h_in_starts_mem =
paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_in_starts_mem->ptr());
auto d_in_starts_mem = paddle::memory::Alloc(
auto d_in_starts_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -184,10 +185,10 @@ class LazyZeros<phi::GPUContext, T> {
// copy each tensor of "outs" data address array to device
auto h_out_addrs_mem =
paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*));
phi::memory_utils::Alloc(cpu_place, xs_size * sizeof(T*));
T** h_out_addrs = reinterpret_cast<T**>(h_out_addrs_mem->ptr());
auto d_out_addrs_mem = paddle::memory::Alloc(
auto d_out_addrs_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -288,10 +289,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
const auto& cpu_place = phi::CPUPlace();
// calculate each tensor's start index and copy to device
auto h_starts_tensor =
paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
phi::memory_utils::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t));
int64_t* h_starts = reinterpret_cast<int64_t*>(h_starts_tensor->ptr());
auto d_starts_tensor = paddle::memory::Alloc(
auto d_starts_tensor = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
(xs_size + 1) * sizeof(int64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -313,11 +314,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = paddle::memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
auto h_mem = phi::memory_utils::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
const T** h_xs = reinterpret_cast<const T**>(h_mem->ptr());
T** h_outs = reinterpret_cast<T**>(h_mem->ptr()) + xs_size;
auto d_mem = paddle::memory::Alloc(
auto d_mem = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
2 * xs_size * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -20,6 +20,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/box_coder.h"
......@@ -199,7 +200,7 @@ void BoxCoderKernel(const Context &dev_ctx,
int grid = (row * col + block - 1) / block;
int bytes = var_size * sizeof(float);
auto dev_var = paddle::memory::Alloc(
auto dev_var = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......@@ -81,7 +82,7 @@ struct MatrixBandPartFunctor {
int workspace_size = 0; \
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \
handle, uplo, n, A, lda, &workspace_size)); \
auto workspace = paddle::memory::Alloc( \
auto workspace = phi::memory_utils::Alloc( \
dev_ctx.GetPlace(), \
workspace_size, \
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()))); \
......@@ -149,7 +150,7 @@ void CholeskyKernel(const Context& dev_ctx,
for_range(matrix_band_part_functor);
}
auto info = paddle::memory::Alloc(
auto info = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
sizeof(int) * batch_count,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -23,6 +23,8 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/distribute_fpn_proposals_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/distribute_fpn_proposals_functor.h"
......@@ -30,9 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/gather.cu.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace phi {
......@@ -188,7 +188,7 @@ void DistributeFpnProposalsKernel(
sizeof(int) * 8,
dev_ctx.stream());
// Allocate temporary storage
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation
// sort target level to get corresponding index
......
......@@ -13,8 +13,6 @@
// limitations under the License.
#include "paddle/phi/kernels/flip_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/place.h"
......
......@@ -25,6 +25,7 @@ namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/detection/bbox_util.h"
#include "paddle/phi/kernels/funcs/for_range.h"
......@@ -72,7 +73,7 @@ static void SortDescending(const phi::GPUContext &ctx,
ctx.stream());
// Allocate temporary storage
auto place = ctx.GetPlace();
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
// Run sorting operation
cub::DeviceRadixSort::SortPairsDescending<T, int>(d_temp_storage->ptr(),
......@@ -297,7 +298,7 @@ static void NMS(const phi::GPUContext &ctx,
const T *boxes = proposals.data<T>();
auto place = ctx.GetPlace();
auto mask_ptr = paddle::memory::Alloc(
auto mask_ptr = phi::memory_utils::Alloc(
place,
boxes_num * col_blocks * sizeof(uint64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -30,6 +30,8 @@ namespace cub = hipcub;
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/graph_reindex_funcs.h"
......@@ -70,7 +72,8 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
input, num_input, len_hashtable, keys, key_index);
// Get item index count.
auto item_count = paddle::memory::Alloc(place, (num_input + 1) * sizeof(int));
auto item_count =
phi::memory_utils::Alloc(place, (num_input + 1) * sizeof(int));
int* item_count_ptr = reinterpret_cast<int*>(item_count->ptr());
#ifdef PADDLE_WITH_HIP
hipMemset(item_count_ptr, 0, sizeof(int) * (num_input + 1));
......@@ -83,7 +86,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
size_t temp_storage_bytes = 0;
cub::DeviceScan::ExclusiveSum(
NULL, temp_storage_bytes, item_count_ptr, item_count_ptr, num_input + 1);
auto d_temp_storage = paddle::memory::Alloc(place, temp_storage_bytes);
auto d_temp_storage = phi::memory_utils::Alloc(place, temp_storage_bytes);
cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(),
temp_storage_bytes,
item_count_ptr,
......@@ -103,7 +106,7 @@ std::shared_ptr<phi::Allocation> FillHashTable(const Context& dev_ctx,
#endif
auto unique_items =
paddle::memory::AllocShared(place, total_unique_items * sizeof(T));
phi::memory_utils::AllocShared(place, total_unique_items * sizeof(T));
T* unique_items_data = reinterpret_cast<T*>(unique_items->ptr());
*final_nodes_len = total_unique_items;
......@@ -217,11 +220,12 @@ void Reindex(const Context& dev_ctx,
int64_t log_num = 1 << static_cast<size_t>(1 + std::log2(num >> 1));
int64_t table_size = log_num << 1;
auto keys = paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
auto keys =
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(T));
auto values =
paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
auto key_index =
paddle::memory::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
phi::memory_utils::Alloc(dev_ctx.GetPlace(), table_size * sizeof(int));
T* keys_ptr = reinterpret_cast<T*>(keys->ptr());
int* values_ptr = reinterpret_cast<int*>(values->ptr());
int* key_index_ptr = reinterpret_cast<int*>(key_index->ptr());
......
......@@ -15,11 +15,11 @@
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/impl/lu_kernel_impl.h"
#include "paddle/phi/kernels/lu_kernel.h"
......@@ -105,7 +105,7 @@ void lu_decomposed_kernel(const Context& dev_ctx,
int lwork;
cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork);
auto work_buff = paddle::memory::Alloc(
auto work_buff = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(T),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -28,6 +28,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/impl/softmax_kernel_impl.h"
#include "paddle/phi/kernels/margin_cross_entropy_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/core/visit_type.h"
......@@ -109,7 +110,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
......
......@@ -33,6 +33,7 @@ namespace cub = hipcub;
#endif
// trace op include
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -104,7 +105,8 @@ void GetClassInterval(const gpuStream_t& stream,
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage = paddle::memory::Alloc(place, cub_temp_storage_bytes);
auto cub_temp_storage =
phi::memory_utils::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(cub_temp_storage->ptr(),
cub_temp_storage_bytes,
num_classes_per_device_ptr,
......
......@@ -22,6 +22,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
......@@ -92,7 +93,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -171,7 +172,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -235,7 +236,7 @@ void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -290,7 +291,7 @@ void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
handle, jobz, uplo, n, A, lda, W, &lwork, params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -352,7 +353,7 @@ void MatrixRankTolKernel(const Context& dev_ctx,
// Must copy X once, because gesvdj will destroy the content on exit.
DenseTensor x_tmp;
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &x_tmp);
auto info = paddle::memory::Alloc(
auto info = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
sizeof(int) * batches,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/impl/nanmedian_kernel_impl.h"
......@@ -176,7 +177,7 @@ void ProcessMedianKernel(const Context& dev_ctx,
nan_counts_ptr);
auto nan_stat_mem_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 2);
int64_t* nan_stat_cpu_ptr =
reinterpret_cast<int64_t*>(nan_stat_mem_cpu->ptr());
paddle::memory::Copy(phi::CPUPlace(),
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/nms_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -75,7 +75,7 @@ void NMSKernel(const Context& dev_ctx,
const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_data = paddle::memory::Alloc(
auto mask_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
num_boxes * blocks_per_line * sizeof(uint64_t),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -26,14 +26,12 @@
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/randint_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
DECLARE_bool(use_curand);
namespace phi {
......@@ -127,7 +125,7 @@ void RandpermRawKernel(
end_bit < 32 ? end_bit : 32,
dev_ctx.stream());
auto d_temp_storage = paddle::memory::Alloc(
auto d_temp_storage = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
temp_storage_bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
......@@ -216,7 +217,7 @@ void RoiAlignGradKernel(const Context& dev_ctx,
}
}
}
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
box_batch_id_list.numel() * sizeof(int),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -227,7 +228,7 @@ void RoiAlignKernel(const Context& dev_ctx,
}
}
int bytes = roi_batch_id_list.numel() * sizeof(int);
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -18,6 +18,7 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -120,7 +121,7 @@ void RoiPoolGradKernel(const Context& dev_ctx,
}
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto roi_ptr = paddle::memory::Alloc(
auto roi_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -184,7 +185,7 @@ void RoiPoolKernel(const Context& dev_ctx,
}
int bytes = box_batch_id_list.numel() * sizeof(int);
auto box_ptr = paddle::memory::Alloc(
auto box_ptr = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -16,7 +16,6 @@
#include <algorithm>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/core/hostdevice.h"
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_grad_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi {
......@@ -87,7 +88,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sigmoid_cross_entropy_with_logits_kernel.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits.h"
namespace phi {
......@@ -86,7 +87,7 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx,
funcs::ReduceKernel<T, T, kps::AddFunctor, NonzeroFunctor<T>>(
dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor<T>(), reduce_dim);
T *norm = dev_ctx.template Alloc<T>(norm_tensor);
auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T));
auto norm_cpu_mem = phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(T));
T *norm_cpu_ptr = reinterpret_cast<T *>(norm_cpu_mem->ptr());
paddle::memory::Copy(phi::CPUPlace(),
norm_cpu_ptr,
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
......@@ -77,7 +78,7 @@ void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(float),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......@@ -158,7 +159,7 @@ void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(
auto workspace = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
lwork * sizeof(double),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/sync_batch_norm_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/gpu/sync_batch_norm_utils.h"
......@@ -84,7 +85,7 @@ void SyncBatchNormKernel(const Context &ctx,
// x, x^2, 1, here 1 is used to calc device num
  // device num can also be obtained from platform::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
alloc_ptr = paddle::memory::Alloc(
alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -30,10 +30,10 @@ namespace cub = hipcub;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/process_group_nccl.h"
#endif
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/norm_utils.cu.h"
#include "paddle/phi/kernels/funcs/norm_utils.h"
......@@ -481,7 +481,7 @@ void SyncBatchNormGradFunctor(
const auto *saved_inv_var =
saved_variance.template data<BatchNormParamType<T>>();
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
auto alloc_ptr = paddle::memory::Alloc(
auto alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
bytes,
phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
......
......@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/triangular_solve_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
......@@ -23,7 +24,6 @@
#include "paddle/phi/kernels/funcs/common_shape.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memory.h"
namespace phi {
......@@ -93,11 +93,10 @@ void TriangularSolveKernel(const Context& dev_ctx,
}
// Copy the addresses of A and tmp_b from host to device.
paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
paddle::memory::Alloc(
dev_ctx.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
phi::Allocator::AllocationPtr tmp_gpu_ptrs_data = phi::memory_utils::Alloc(
dev_ctx.GetPlace(),
cpu_ptrs.size() * sizeof(T*),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
......
......@@ -14,7 +14,6 @@
#include "paddle/phi/kernels/yolo_box_kernel.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
......@@ -48,17 +49,17 @@ void AverageAccumulatesKernel(const Context& dev_ctx,
// int64_t old_num_accumulates = 0;
auto num_updates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_updates_cpu_ptr =
reinterpret_cast<int64_t*>(num_updates_cpu->ptr());
auto num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(num_accumulates_cpu->ptr());
auto old_num_accumulates_cpu =
paddle::memory::Alloc(phi::CPUPlace(), sizeof(int64_t));
phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t));
int64_t* old_num_accumulates_cpu_ptr =
reinterpret_cast<int64_t*>(old_num_accumulates_cpu->ptr());
......
......@@ -20,15 +20,15 @@
namespace phi {
template <typename T, typename Context>
void RoiPooGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
const paddle::optional<DenseTensor>& boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx);
void RoiPoolGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& boxes,
const paddle::optional<DenseTensor>& boxes_num,
const DenseTensor& arg_max,
const DenseTensor& out_grad,
int pooled_height,
int pooled_width,
float spatial_scale,
DenseTensor* dx);
} // namespace phi
......@@ -62,7 +62,7 @@ TEST(sparse_csr_tensor, construct) {
CHECK_EQ(sparse.numel(), 9);
CHECK(sparse.dims() == dense_dims);
CHECK(sparse.dtype() == DataType::FLOAT32);
CHECK(sparse.place() == paddle::platform::CPUPlace());
CHECK(sparse.place() == phi::CPUPlace());
CHECK(sparse.initialized() == true);
}
......
......@@ -15,10 +15,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/strided_memcpy.h"
#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/memory_utils.h"
namespace phi {
namespace tests {
......@@ -94,7 +93,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
auto src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
......@@ -102,7 +101,7 @@ TEST(StridedMemcpy, GPUCrop) {
phi::DDim src_stride({5, 1});
int dst[4];
auto dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
auto dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(dst_allocation->ptr());
phi::DDim dst_dim({2, 2});
......@@ -134,12 +133,12 @@ TEST(StridedMemcpy, GPUConcat) {
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(phi::GPUPlace()));
auto gpu_src_allocation = paddle::memory::Alloc(gpu0, sizeof(src));
auto gpu_src_allocation = phi::memory_utils::Alloc(gpu0, sizeof(src));
int* gpu_src = reinterpret_cast<int*>(gpu_src_allocation->ptr());
paddle::memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx->stream());
int dst[8];
auto gpu_dst_allocation = paddle::memory::Alloc(gpu0, sizeof(dst));
auto gpu_dst_allocation = phi::memory_utils::Alloc(gpu0, sizeof(dst));
int* gpu_dst = reinterpret_cast<int*>(gpu_dst_allocation->ptr());
phi::DDim src_stride({2, 1});
......
......@@ -97,6 +97,7 @@ int main(int argc, char** argv) {
::GFLAGS_NAMESPACE::ParseCommandLineFlags(
&new_argc, &new_argv_address, false);
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDefaultKernelSignatureMap();
int ret = RUN_ALL_TESTS();
......
......@@ -236,6 +236,7 @@ def __bootstrap__():
core.init_devices()
core.eager._init_eager_and_static_tensor_operants()
core.init_default_kernel_signatures()
core.init_memory_method()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......
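Note: every hunk above converges on the same two allocation calls. Below is a minimal sketch of that pattern, assuming only the phi::memory_utils::Alloc overloads and the paddle::memory::Copy call already shown in this diff; the function name, element type, and sizes are illustrative and not part of this change.

// Illustrative sketch only; not part of this commit.
#include "paddle/fluid/memory/memcpy.h"      // paddle::memory::Copy
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"  // phi::memory_utils::Alloc

namespace example {

void AllocPattern(const phi::GPUContext& dev_ctx, int64_t n) {
  // Device-side workspace bound to the kernel's stream.
  auto workspace = phi::memory_utils::Alloc(
      dev_ctx.GetPlace(),
      n * sizeof(float),
      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
  float* d_ws = reinterpret_cast<float*>(workspace->ptr());

  // Host-side staging buffer; the CPUPlace overload takes no stream.
  auto host_buf = phi::memory_utils::Alloc(phi::CPUPlace(), n * sizeof(float));
  float* h_buf = reinterpret_cast<float*>(host_buf->ptr());

  // Copying between the buffers still goes through paddle::memory::Copy,
  // exactly as in the hunks above.
  paddle::memory::Copy(phi::CPUPlace(),
                       h_buf,
                       dev_ctx.GetPlace(),
                       d_ws,
                       n * sizeof(float),
                       dev_ctx.stream());
}

}  // namespace example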