Unverified · Commit 3d78e759, authored by YuanRisheng, committed by GitHub

[PHI Decoupling]Remove memory header (Part3) (#51288)

* decouple memory copy

* fix ci bugs

* fix ci compile bugs

* fix rocm compile

* fix ci bugs

* decouple memory

* deal with conflict

* fix xpu compile bugs

* fix xpu bugs

* deal with xpu bugs

* fix cmake bugs

* fix windows bugs

* fix ci bugs

* fix ci bugs

* delete redundant code

* add code for pybind

* fix py3 bugs

* fix ci bugs
Parent 3ab19ab4
......@@ -541,8 +541,8 @@ bool AnalysisPredictor::PrepareScope(
scope_ = parent_scope;
status_is_cloned_ = true;
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
// TODO(wilber): we need to release memory occupied by weights.
scope_.reset(new paddle::framework::Scope());
......
......@@ -94,8 +94,8 @@ bool NativePaddlePredictor::Init(
platform::errors::PreconditionNotMet(
"The sub_scope should not be nullptr."));
} else {
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
scope_.reset(new paddle::framework::Scope());
}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <type_traits>
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
class Buffer {
public:
explicit Buffer(const platform::Place &place) : place_(place) {}
template <typename T>
T *Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory::Alloc(place_, size);
}
return reinterpret_cast<T *>(allocation_->ptr());
}
template <typename T>
const T *Get() const {
return reinterpret_cast<const T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T *GetMutable() {
return reinterpret_cast<T *>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
platform::Place GetPlace() const { return place_; }
private:
AllocationPtr allocation_;
platform::Place place_;
};
} // namespace memory
} // namespace paddle
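
For orientation, here is a minimal usage sketch of the Buffer helper declared above; the place, element type, and sizes are illustrative and not taken from this diff:

// Hypothetical usage of paddle::memory::Buffer (illustrative only).
// The buffer allocates lazily and reuses its allocation as long as later
// requests fit into it.
paddle::platform::CUDAPlace place(0);      // assumed: GPU device 0
paddle::memory::Buffer buffer(place);
float *out = buffer.Alloc<float>(1024);    // allocates 1024 * sizeof(float) bytes
// ... fill `out` on the device ...
size_t bytes = buffer.Size();              // size of the current allocation in bytes
float *same = buffer.Alloc<float>(512);    // smaller request: reuses the same allocation
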
......@@ -14,7 +14,6 @@
#include <cmath>
#include "paddle/fluid/memory/buffer.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/cast_with_ptr.h"
#include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h"
......@@ -22,6 +21,7 @@
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/utils/data_type.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/tensor_to_string.h"
......@@ -191,7 +191,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place,
<< " , tensor_num = " << n;
using MT = MasterT<InT>;
memory::Buffer tmp_out(place);
phi::memory_utils::Buffer tmp_out(place);
auto *tmp_out_ptr = tmp_out.Alloc<MT>(n * max_chunk_num);
FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream);
......@@ -950,7 +950,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff,
std::is_same<T, float>::value ? ncclFloat32 : ncclFloat16;
bool should_destroy_op =
scale && CreatePreMulScaleOpIfSupported(dtype, comm, scale, &op);
memory::Buffer buffer(dev_ctx.GetPlace());
phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
if (scale && !should_destroy_op) {
T *new_sendbuff = buffer.Alloc<T>(numel);
LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream);
......@@ -1012,7 +1012,7 @@ static void CubDeviceReduce(InputIteratorT d_in,
ReduceOpT reduction_op,
T init,
gpuStream_t stream,
memory::Buffer *buffer) {
phi::memory_utils::Buffer *buffer) {
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(d_temp_storage,
......@@ -1041,7 +1041,7 @@ static void GetSquareGradNormImpl(const T *grad,
int n,
float *square_norm,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
using Iterator =
cub::TransformInputIterator<float, SquareFunctor<T>, const T *>;
Iterator iter(grad, SquareFunctor<T>());
......@@ -1061,7 +1061,7 @@ static void GetSquareGradNorm(const float *fp32_grad,
int fp16_numel,
float *square_norm,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
VLOG(10) << "GetSquareGradNorm starts, fp32_numel = " << fp32_numel
<< " , fp16_numel = " << fp16_numel;
if (fp32_numel > 0) {
......@@ -1108,11 +1108,11 @@ static std::string GetMinMaxStr(const T *x,
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Buffer ret_buffer(place);
phi::memory_utils::Buffer ret_buffer(place);
T *ret = ret_buffer.Alloc<T>(2);
if (n > 0) {
memory::Buffer cub_buffer(place);
phi::memory_utils::Buffer cub_buffer(place);
CubDeviceReduce(x,
ret,
n,
......@@ -1197,8 +1197,8 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) {
if (numel <= 0) return false;
cub::TransformInputIterator<bool, IsNanInfFunctor<T>, const T *> iter(
x, IsNanInfFunctor<T>());
memory::Buffer buffer(dev_ctx.GetPlace());
memory::Buffer out(dev_ctx.GetPlace());
phi::memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::memory_utils::Buffer out(dev_ctx.GetPlace());
CubDeviceReduce(iter,
out.Alloc<bool>(1),
numel,
......@@ -1230,7 +1230,7 @@ static void CheckHasNanInfGrad(const float *fp32_grad,
int fp16_numel,
float *nan_inf_flag,
gpuStream_t stream,
memory::Buffer *cub_tmp_buffer) {
phi::memory_utils::Buffer *cub_tmp_buffer) {
bool *fp32_has_nan_inf = nullptr;
bool *fp16_has_nan_inf = nullptr;
if (fp32_numel > 0) {
......@@ -1683,11 +1683,11 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
}
}
memory::Buffer grad_norm_square_buffer(place);
phi::memory_utils::Buffer grad_norm_square_buffer(place);
auto *fp32_square_grad_norm = grad_norm_square_buffer.Alloc<float>(2);
memory::Buffer cub_tmp_buffer(place);
phi::memory_utils::Buffer cub_tmp_buffer(place);
memory::Buffer sum_grad_buffer(place);
phi::memory_utils::Buffer sum_grad_buffer(place);
float *fp32_sum_grad;
platform::float16 *fp16_sum_grad;
auto fp32_numel_each_device = fp32_numel / num_devices;
......@@ -2086,7 +2086,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
fp16_partial_fused_offsets_t->numel(),
fp16_partial_fused_offsets_t->place());
memory::Buffer trust_ratio_div_buffer(place);
phi::memory_utils::Buffer trust_ratio_div_buffer(place);
auto *trust_ratio_div = trust_ratio_div_buffer.Alloc<float>(partial_numel);
auto fp32_offset = local_rank * fp32_numel_each_device;
auto fp16_offset = local_rank * fp16_numel_each_device;
......@@ -2149,7 +2149,7 @@ class DistributedFusedLambOpKernel<phi::GPUContext, T>
VLOG(10) << "Update Moment and TrustRatioDiv done hehahaha";
// Step 8: calculate L2-Norm square of parameter and trust_ratio_div
memory::Buffer square_norm_buffer(place);
phi::memory_utils::Buffer square_norm_buffer(place);
auto *param_square_norm = square_norm_buffer.Alloc<float>(2 * param_num);
auto *trust_ratio_div_square_norm = param_square_norm + param_num;
if (num_devices > 1) {
......
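
As context for the cub_tmp_buffer parameters above, these helpers follow CUB's standard two-pass pattern, sketched below; d_in, d_out, n, reduction_op, init, and stream stand for whatever the caller passes in:

// Pass 1 with a null temp pointer only reports the required scratch size;
// the Buffer then supplies that scratch memory for pass 2.
void *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, n, reduction_op, init, stream));
d_temp_storage = cub_tmp_buffer->Alloc<void>(temp_storage_bytes);
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Reduce(
    d_temp_storage, temp_storage_bytes, d_in, d_out, n, reduction_op, init, stream));
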
......@@ -17,10 +17,12 @@
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/phi/backends/custom/fake_cpu_device.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/common/memory_utils.h"
void RegisterDevice() {
CustomRuntimeParams runtime_params;
......@@ -54,8 +56,7 @@ void InitDevice() {
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(
places, paddle::platform::EmplaceExternalContext);
paddle::platform::DeviceContextPool::Init(places);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
......@@ -240,8 +241,8 @@ void TestCustomCCL(const paddle::platform::Place& place) {
}
TEST(CustomDevice, Tensor) {
InitDevice();
paddle::framework::InitMemoryMethod();
InitDevice();
auto dev_types = phi::DeviceManager::GetAllDeviceTypes();
for (const auto& dev_type : dev_types) {
std::cout << "Test on " << dev_type << std::endl;
......
......@@ -38,8 +38,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
#include "paddle/phi/backends/context_pool_utils.h"
namespace paddle {
namespace platform {
......@@ -64,73 +62,219 @@ DeviceType Place2DeviceType(const platform::Place& place) {
}
}
void EmplaceExternalContext(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(instance.GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(instance.GetAllocator(phi::CPUPlace()).get());
dev_ctx->SetZeroAllocator(instance.GetZeroAllocator(p).get());
dev_ctx->SetHostZeroAllocator(
instance.GetZeroAllocator(phi::CPUPlace()).get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const platform::Place& place,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& place : set) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (platform::is_mlu_place(place)) {
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
phi::EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif
} else if (platform::is_ipu_place(place)) {
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
phi::EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(place)) {
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(place)) {
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
}
}
}
......
......@@ -241,15 +241,13 @@ using CUDAPinnedDeviceContext = phi::GPUPinnedContext;
using CustomDeviceContext = phi::CustomContext;
#endif
void EmplaceExternalContext(
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const platform::Place& place,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
using phi::EmplaceDeviceContexts;
using DeviceContextPool = phi::DeviceContextPool;
} // namespace platform
......
......@@ -56,6 +56,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/custom_kernel.h"
......@@ -274,7 +275,7 @@ void InitDevices(const std::vector<int> devices) {
}
}
#endif
platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads);
......@@ -472,6 +473,8 @@ void InitMemoryMethod() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
#endif
memory_method->emplace_device_contexts =
paddle::platform::EmplaceDeviceContexts;
memory_method->init_devices = InitDevices;
memory_utils.Init(std::move(memory_method));
});
......
......@@ -21,7 +21,7 @@ namespace paddle {
InitPhi::InitPhi() {
paddle::framework::InitMemoryMethod();
LOG(INFO) << "Init MemoryMethod success.";
VLOG(4) << "Init MemoryMethod success.";
}
} // namespace paddle
......@@ -2,8 +2,14 @@ add_subdirectory(dynload)
add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
set(BACKENDS_DEPS allocator generator)
set(BACKENDS_DEPS
enforce
place
flags
eigen3
phi_device_context
generator
phi_os_info)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
......
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/context_pool_utils.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/enforce.h"
namespace phi {
......@@ -35,13 +36,9 @@ DeviceContextPool& DeviceContextPool::Instance() {
return *pool;
}
EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
nullptr;
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
emplace_external_context_func_ = func;
const std::vector<phi::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
......@@ -102,106 +99,12 @@ void DeviceContextPool::SetDeviceContexts(
external_device_contexts_ = dev_ctxs;
}
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0,
emplace_external_context_func_);
phi::memory_utils::EmplaceDeviceContexts(
&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0);
}
} // namespace phi
......@@ -72,28 +72,13 @@ struct DefaultDeviceContextType<phi::CustomPlace> {
};
#endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<phi::Place>& places,
EmplaceExternalContextFunc func = nullptr);
static DeviceContextPool& Init(const std::vector<phi::Place>& places);
static bool IsInitialized();
......@@ -126,7 +111,6 @@ class DeviceContextPool {
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
static EmplaceExternalContextFunc emplace_external_context_func_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
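
The "lazy evaluation" comment in EmplaceDeviceContext above relies on std::launch::deferred; the pattern in isolation (using plain ints instead of DeviceContext) looks roughly like this:

#include <future>
#include <iostream>
#include <map>
#include <memory>

// A deferred std::async stores a callable that only runs when the
// shared_future is first queried, mirroring how the context pool delays
// DeviceContext construction until the first Get().
int main() {
  std::map<int, std::shared_future<std::unique_ptr<int>>> pool;
  pool.emplace(0, std::async(std::launch::deferred, [] {
                 std::cout << "constructed at first get()" << std::endl;
                 return std::make_unique<int>(42);
               }));
  std::cout << "nothing constructed yet" << std::endl;
  const std::unique_ptr<int> &ctx = pool.at(0).get();  // triggers construction
  std::cout << *ctx << std::endl;
  return 0;
}
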
......@@ -28,6 +28,7 @@
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
......@@ -55,7 +56,7 @@ class CUDAGraphContextManager {
DeviceContextMap &ctxs = cuda_graph_ctx_pool_[pool_id];
if (ctxs.find(place) == ctxs.end()) {
EmplaceDeviceContexts(
phi::memory_utils::EmplaceDeviceContexts(
&ctxs,
{place},
/*disable_setting_default_stream_for_allocator=*/true,
......
......@@ -77,6 +77,19 @@ void GpuMemoryUsage(size_t* available, size_t* total) {
void InitDevices() { MemoryUtils::Instance().InitDevices(); }
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
MemoryUtils::Instance().EmplaceDeviceContexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
} // namespace memory_utils
} // namespace phi
......@@ -14,8 +14,12 @@
#pragma once
#include <future> // NOLINT
#include <unordered_map>
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/macros.h"
#include "paddle/phi/core/stream.h"
......@@ -128,6 +132,24 @@ struct MemoryInterface {
* @brief init devices info and device context
*/
void (*init_devices)();
/**
* @brief create device_contexts for the given places and put them into
* place_to_device_context
*
* @param place_to_device_context the map in which the created
* device_contexts will be stored
* @param places the places for which device_contexts are created
* @param disable_setting_default_stream_for_allocator whether to set the
* default stream for the allocator
* @param stream_priority the stream priority to use
*/
void (*emplace_device_contexts)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
};
class MemoryUtils {
......@@ -271,12 +293,34 @@ class MemoryUtils {
memory_method_->init_devices();
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
CheckMemoryMethod();
PADDLE_ENFORCE_NE(
memory_method_->emplace_device_contexts,
nullptr,
phi::errors::Unavailable(
"emplace_device_contexts method in memory_method_ is not "
"initiazed yet. You need init it first."));
memory_method_->emplace_device_contexts(
place_to_device_context,
places,
disable_setting_default_stream_for_allocator,
stream_priority);
}
void CheckMemoryMethod() {
PADDLE_ENFORCE_NE(
memory_method_.get(),
nullptr,
phi::errors::Unavailable("memory_method_ in MemoryUtils is not "
"initiazed yet. You need init it first."));
phi::errors::Unavailable(
"memory_method_ in MemoryUtils is not "
"initialized yet. You need to init it first. If you compiled with "
"Fluid, you can call InitMemoryMethod() for initialization."));
}
private:
......@@ -334,6 +378,50 @@ void GpuMemoryUsage(size_t* available, size_t* total);
void InitDevices();
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
class Buffer {
public:
explicit Buffer(const phi::Place& place) : place_(place) {}
template <typename T>
T* Alloc(size_t size) {
using AllocT = typename std::
conditional<std::is_same<T, void>::value, uint8_t, T>::type;
if (UNLIKELY(size == 0)) return nullptr;
size *= sizeof(AllocT);
if (allocation_ == nullptr || allocation_->size() < size) {
allocation_ = memory_utils::Alloc(place_, size);
}
return reinterpret_cast<T*>(allocation_->ptr());
}
template <typename T>
const T* Get() const {
return reinterpret_cast<const T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
template <typename T>
T* GetMutable() {
return reinterpret_cast<T*>(
allocation_ && allocation_->size() > 0 ? allocation_->ptr() : nullptr);
}
size_t Size() const { return allocation_ ? allocation_->size() : 0; }
phi::Place GetPlace() const { return place_; }
private:
Allocator::AllocationPtr allocation_;
phi::Place place_;
};
} // namespace memory_utils
} // namespace phi
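
To make the indirection above concrete, a stripped-down sketch of the registration pattern follows (all names are simplified stand-ins, not the real Paddle symbols): phi only holds a function pointer, and the fluid layer fills it in during InitMemoryMethod, so phi no longer needs to include fluid headers.

#include <iostream>

// "phi" side: an interface struct with an unset function pointer.
struct MemoryInterfaceSketch {
  void (*emplace_device_contexts)(int stream_priority) = nullptr;
};
MemoryInterfaceSketch g_memory_iface;  // stands in for MemoryUtils::Instance()

// "fluid" side: the concrete implementation, registered once at startup.
void FluidEmplaceDeviceContexts(int stream_priority) {
  std::cout << "creating device contexts, priority " << stream_priority << std::endl;
}
void InitMemoryMethodSketch() {
  g_memory_iface.emplace_device_contexts = &FluidEmplaceDeviceContexts;
}

// "phi" side call site, e.g. the DeviceContextPool constructor.
void BuildPoolSketch() {
  if (g_memory_iface.emplace_device_contexts == nullptr) {
    std::cerr << "emplace_device_contexts is not registered; "
                 "call InitMemoryMethodSketch() first" << std::endl;
    return;
  }
  g_memory_iface.emplace_device_contexts(/*stream_priority=*/0);
}

int main() {
  InitMemoryMethodSketch();  // must run before the pool is built
  BuildPoolSketch();
  return 0;
}
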
......@@ -18,8 +18,8 @@ limitations under the License. */
#include <Eigen/Dense>
#include <vector>
#include "paddle/fluid/memory/buffer.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/algorithm.h"
......
......@@ -14,7 +14,7 @@
#pragma once
#include "paddle/fluid/memory/buffer.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -36,7 +36,7 @@ void SquaredL2Norm(const phi::CPUContext& ctx,
const T1* x,
T2* y,
size_t numel,
paddle::memory::Buffer* buffer = nullptr) {
memory_utils::Buffer* buffer = nullptr) {
if (std::is_same<T1, T2>::value) {
using EigenT = typename phi::EigenTensor<T1, 1>::Type;
using ConstEigenT = typename phi::EigenTensor<T1, 1>::ConstType;
......@@ -60,9 +60,9 @@ void SquaredL2Norm(const phi::GPUContext& ctx,
const T1* x,
T2* y,
size_t numel,
paddle::memory::Buffer* buffer = nullptr) {
memory_utils::Buffer* buffer = nullptr) {
if (UNLIKELY(buffer == nullptr)) {
paddle::memory::Buffer tmp_buffer(ctx.GetPlace());
memory_utils::Buffer tmp_buffer(ctx.GetPlace());
return SquaredL2Norm(ctx, x, y, numel, &tmp_buffer);
}
......
......@@ -240,7 +240,7 @@ void ComputeImpl(const Context& dev_ctx,
// TODO(zengjinle): remove the following Eigen operations when
// *skip_update == true.
paddle::memory::Buffer buffer(dev_ctx.GetPlace());
memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::funcs::SquaredL2Norm(
dev_ctx,
reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
......
......@@ -294,7 +294,7 @@ void ComputeRowImpl(const Context& dev_ctx,
// TODO(zengjinle): remove the following Eigen operations when
// *skip_update == true.
paddle::memory::Buffer buffer(dev_ctx.GetPlace());
memory_utils::Buffer buffer(dev_ctx.GetPlace());
phi::funcs::SquaredL2Norm(
dev_ctx,
reinterpret_cast<const MT*>(IsMultiPrecision ? master_param_ptr
......
......@@ -96,8 +96,8 @@ int main(int argc, char** argv) {
char** new_argv_address = new_argv.data();
::GFLAGS_NAMESPACE::ParseCommandLineFlags(
&new_argc, &new_argv_address, false);
paddle::framework::InitDevices();
paddle::framework::InitMemoryMethod();
paddle::framework::InitDevices();
paddle::framework::InitDefaultKernelSignatureMap();
int ret = RUN_ALL_TESTS();
......
......@@ -211,10 +211,10 @@ def __bootstrap__():
sys.argv = [""]
core.init_glog(sys.argv[0])
# don't init_p2p when in unittest to save time.
core.init_memory_method()
core.init_devices()
core.init_tensor_operants()
core.init_default_kernel_signatures()
core.init_memory_method()
# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
......