Unverified commit a1006b2b, authored by Huang Jiyi, committed by GitHub

[phi decoupling] decouple dependency on device_context in phi (Part 1) (#50865)

* move DeviceContextPool to phi

* add EmplaceExternalContextFunc

* update namespace

* update cmake

* fix bugs and create context_pool_impl.h

* replace platform::is_xxx_place

* fix bugs

* update generator

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix bugs

* fix enforce usage

* Revert "fix enforce usage"

This reverts commit 5f521f08a69713cee506e64a00ec6d9fba709e27.

* fix bugs

* rm XPUDeviceContext and CustomDeviceContext

* fix bugs

* fix context init bug

* fix bugs after merge

* fix bugs

* fix name

* fix mutable_data

* update and fix bugs

* fix bugs

* update

* fix bugs

* fix name

* fix bugs

* merge

* fix bugs

* create context_pool in phi/backends

* create context_pool in phi/backends

* fix bugs

* fix xpu bugs

* fix rocm bugs

* fix bugs

* fix bugs

* fix bugs

* fix xpu bugs

* update

* update

* fix bugs

* fix bugs
Parent: 203a62b8
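Before the diff itself, a minimal usage sketch (editor's addition, not part of the commit): after this change phi owns `DeviceContextPool`, and fluid-only context types are injected through the optional `EmplaceExternalContextFunc` callback; the diff below passes `paddle::platform::EmplaceExternalContext` for that. The CPU-only setup and `main()` wrapper here are illustrative assumptions.

```cpp
#include <vector>

#include "paddle/phi/backends/context_pool.h"

int main() {
  // Initialize the pool that now lives in phi. The second argument is the
  // optional external-context callback; nullptr means "native phi contexts
  // only" (fluid passes paddle::platform::EmplaceExternalContext instead).
  std::vector<phi::Place> places{phi::CPUPlace()};
  phi::DeviceContextPool::Init(places, /*func=*/nullptr);

  // Contexts are created lazily on the first Get().
  phi::DeviceContext* ctx =
      phi::DeviceContextPool::Instance().Get(phi::CPUPlace());
  return ctx != nullptr ? 0 : 1;
}
```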
......@@ -94,4 +94,6 @@ paddle/fluid/pybind/op_function_impl.h
paddle/fluid/pybind/*final_state_op_function_impl.h
paddle/fluid/prim/api/generated/prim_api/*
paddle/fluid/framework/__init__.py
paddle/phi/api/profiler/__init__.py
python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py
......@@ -165,7 +165,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
conv_desc[i], CUDNN_DEFAULT_MATH));
#if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000
if (!platform::allow_tf32_cudnn) {
if (!phi::allow_tf32_cudnn) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnSetConvolutionMathType(conv_desc[i],
CUDNN_FMA_MATH));
......
......@@ -19,13 +19,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class XPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -159,7 +159,6 @@ cc_library(
cudnn_workspace_helper
${XPU_CTX_DEPS}
${MLU_CTX_DEPS}
eigen3
phi_backends
phi_device_context
generator)
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
DECLARE_bool(use_stream_safe_cuda_allocator);
DECLARE_bool(new_executor_use_cuda_graph);
......
......@@ -54,7 +54,8 @@ void InitDevice() {
}
EXPECT_GT(static_cast<int>(places.size()), 0);
paddle::platform::DeviceContextPool::Init(places);
paddle::platform::DeviceContextPool::Init(
places, paddle::platform::EmplaceExternalContext);
}
void TestDeviceInterface(const paddle::platform::Place& place) {
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <set>
#include "glog/logging.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -39,19 +38,11 @@ limitations under the License. */
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
#include "paddle/phi/backends/context_pool_utils.h"
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif // PADDLE_WITH_CUDA
DeviceType Place2DeviceType(const platform::Place& place) {
if (platform::is_cpu_place(place)) {
return platform::DeviceType::CPU;
......@@ -73,312 +64,76 @@ DeviceType Place2DeviceType(const platform::Place& place) {
}
}
static DeviceContextPool* pool = nullptr;
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<platform::Place>& places) {
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const platform::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const platform::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
std::unique_ptr<DeviceContext> CreateDeviceContext(
const platform::Place& p,
bool disable_setting_default_stream_for_allocator = false,
int stream_priority = 0) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (is_xpu_place(p)) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
memory::allocation::AllocatorFacade::Instance().GetAllocator(p).get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(platform::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
void EmplaceExternalContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
platform::Place place,
const platform::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<platform::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
PADDLE_ENFORCE_GT(
places.size(),
0,
platform::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
if (platform::is_cpu_place(p)) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (platform::is_gpu_place(p)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("CUDAPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_xpu_place(p)) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<XPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<CUDAPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option."));
#endif
} else if (platform::is_mlu_place(p)) {
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif
} else if (platform::is_ipu_place(p)) {
} else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<IPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<NPUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported. Please "
"re-compile with WITH_ASCEND_CL option."));
#endif
} else if (platform::is_npu_pinned_place(p)) {
} else if (platform::is_npu_pinned_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
} else if (platform::is_custom_place(p)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<CustomDeviceContext>(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
phi::EmplaceDeviceContext<NPUPinnedDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPinnedPlace is not supported. Please re-compile with "
"WITH_ASCEND_CL "
"option."));
#endif
}
}
}
DeviceContextPool::DeviceContextPool(
const std::vector<platform::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0);
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
......@@ -390,19 +145,6 @@ void IPUDeviceContext::Wait() const {
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() : phi::XPUContext() {
phi::XPUContext::Init();
}
XPUDeviceContext::~XPUDeviceContext() {}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : phi::XPUContext(place) {
phi::XPUContext::Init();
LOG_FIRST_N(WARNING, 1) << "Please NOTE: xpu device: "
<< static_cast<int>(place.device);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -469,14 +211,5 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDeviceContext::CustomDeviceContext(CustomPlace place)
: phi::CustomContext(place) {
Init();
stream_.reset(new phi::stream::Stream(place, stream()));
}
CustomDeviceContext::~CustomDeviceContext() {}
#endif
} // namespace platform
} // namespace paddle
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_types.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/custom/custom_context.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
......@@ -98,18 +99,6 @@ struct GpuDevice;
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
enum DeviceType {
CPU = 0,
CUDA = 1,
......@@ -134,14 +123,6 @@ constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE;
using DeviceContext = phi::DeviceContext;
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<platform::CPUPlace> {
using TYPE = phi::CPUContext;
};
// Graphcore IPU
#ifdef PADDLE_WITH_IPU
class IPUDeviceContext
......@@ -161,35 +142,15 @@ class IPUDeviceContext
private:
IPUPlace place_;
};
template <>
struct DefaultDeviceContextType<platform::IPUPlace> {
using TYPE = IPUDeviceContext;
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDeviceContext;
template <>
struct DefaultDeviceContextType<platform::MLUPlace>;
#endif
#ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public phi::XPUContext {
public:
XPUDeviceContext();
explicit XPUDeviceContext(XPUPlace place);
virtual ~XPUDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
xpuStream stream() const { return XPUContext::x_context()->xpu_stream; }
void CreateStream() { XPUContext::CreateStream(); }
};
template <>
struct DefaultDeviceContextType<platform::XPUPlace> {
using TYPE = XPUDeviceContext;
};
using XPUDeviceContext = phi::XPUContext;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -251,11 +212,6 @@ class NPUDeviceContext
DISABLE_COPY_AND_ASSIGN(NPUDeviceContext);
};
template <>
struct DefaultDeviceContextType<platform::NPUPlace> {
using TYPE = NPUDeviceContext;
};
// Currently, NPUPinnedDeviceContext is only used to data copying.
class NPUPinnedDeviceContext
: public DeviceContext,
......@@ -275,19 +231,9 @@ class NPUPinnedDeviceContext
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::NPUPinnedPlace> {
using TYPE = NPUPinnedDeviceContext;
};
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = phi::GPUContext;
};
// Currently, CUDAPinnedDeviceContext is only used to data copying.
class CUDAPinnedDeviceContext
: public DeviceContext,
......@@ -306,90 +252,57 @@ class CUDAPinnedDeviceContext
CUDAPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
using TYPE = CUDAPinnedDeviceContext;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDeviceContext : public phi::CustomContext {
public:
explicit CustomDeviceContext(CustomPlace place);
virtual ~CustomDeviceContext();
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return stream_->AddCallback(callback);
}
void WaitStreamCallback() const { return stream_->WaitCallback(); }
private:
std::shared_ptr<phi::stream::Stream> stream_;
};
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = CustomDeviceContext;
};
#else
template <>
struct DefaultDeviceContextType<platform::CustomPlace> {
using TYPE = DeviceContext;
};
using CustomDeviceContext = phi::CustomContext;
#endif
void EmplaceDeviceContexts(
void EmplaceExternalContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<platform::Place>& places,
const platform::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<platform::Place>& places);
using phi::EmplaceDeviceContexts;
static bool IsInitialized();
using DeviceContextPool = phi::DeviceContextPool;
static void SetPool(DeviceContextPool* dev_pool);
} // namespace platform
} // namespace paddle
/*! \brief Return handle of single device context. */
platform::DeviceContext* Get(const platform::Place& place);
namespace phi {
template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
#ifdef PADDLE_WITH_IPU
template <>
struct DefaultDeviceContextType<phi::IPUPlace> {
using TYPE = paddle::platform::IPUDeviceContext;
};
#endif
size_t size() const;
#ifdef PADDLE_WITH_MLU
template <>
struct DefaultDeviceContextType<phi::MLUPlace>;
#endif
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const;
#ifdef PADDLE_WITH_ASCEND_CL
template <>
struct DefaultDeviceContextType<phi::NPUPlace> {
using TYPE = paddle::platform::NPUDeviceContext;
};
static void SetDeviceContexts(
const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*);
template <>
struct DefaultDeviceContextType<phi::NPUPinnedPlace> {
using TYPE = paddle::platform::NPUPinnedDeviceContext;
};
#endif
private:
explicit DeviceContextPool(const std::vector<platform::Place>& places);
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
using TYPE = paddle::platform::CUDAPinnedDeviceContext;
};
#endif
} // namespace platform
} // namespace paddle
} // namespace phi
......@@ -119,59 +119,6 @@ using namespace ::phi::enforce; // NOLINT
#define PADDLE_MAY_THROW noexcept(false)
#endif
/*
* Summary: This macro is used to get Variable or internal type
* data (such as LoDTensor or SelectedRows) of the Input and
* Output in op, generally used when call scope.FindVar(Input/
* Output("Name")) or ctx.Input<LoDTensor>().
* Firstly this macro check whether the obtained pointer is null,
* and then return data if it is not null.
*
* Note: This macro is only suitable for specific scenarios and
* does not intended to be widely used. If it cannot meet the
* requirements, please use other PADDLE_ENFORCE** check macro.
*
* Parameters:
*     __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
* Summary: This macro is used to check whether op has specified
* Input or Output Variables. Because op's Input and Output
......
......@@ -282,7 +282,7 @@ void InitDevices(const std::vector<int> devices) {
}
}
#endif
platform::DeviceContextPool::Init(places);
platform::DeviceContextPool::Init(places, platform::EmplaceExternalContext);
#ifndef PADDLE_WITH_MKLDNN
platform::SetNumThreads(FLAGS_paddle_num_threads);
......
......@@ -27,7 +27,7 @@ TEST(InitDevices, CPU) {
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU)
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
ASSERT_EQ(pool.Size(), 1U);
#endif
}
......@@ -39,7 +39,7 @@ TEST(InitDevices, CUDA) {
int count = paddle::platform::GetGPUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 2U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 2U + static_cast<unsigned>(count));
#endif
}
......@@ -51,7 +51,7 @@ TEST(InitDevices, XPU) {
int count = paddle::platform::GetXPUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
......@@ -63,7 +63,7 @@ TEST(InitDevices, MLU) {
int count = paddle::platform::GetMLUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
......
......@@ -2526,10 +2526,10 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
m.def("get_cublas_switch", platform::AllowTF32Cublas);
m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
m.def("get_cudnn_switch", platform::AllowTF32Cudnn);
m.def("set_cublas_switch", phi::SetAllowTF32Cublas);
m.def("get_cublas_switch", phi::AllowTF32Cublas);
m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn);
m.def("get_cudnn_switch", phi::AllowTF32Cudnn);
#endif // PADDLE_WITH_CUDA
m.def("clear_executor_cache", []() {
pybind11::gil_scoped_release release;
......
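The pybind hunk above only re-points the Python bindings; the TF32 switches themselves now live in phi (see the new paddle/phi/backends/context_pool.{h,cc} later in this diff). A hedged C++ sketch of the relocated API, assuming a CUDA or HIP build:

```cpp
#include "paddle/phi/backends/context_pool.h"

// Returns true if TF32 is still enabled for either library after disabling.
bool DisableTF32ForDebugging() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // Formerly paddle::platform::SetAllowTF32Cublas / SetAllowTF32Cudnn.
  phi::SetAllowTF32Cublas(false);
  phi::SetAllowTF32Cudnn(false);
  return phi::AllowTF32Cublas() || phi::AllowTF32Cudnn();
#else
  return false;
#endif
}
```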
......@@ -91,7 +91,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
void (*)(const phi::DeviceContext&,
const std::vector<const phi::SelectedRows*>&,
phi::SelectedRows*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
......@@ -119,7 +119,7 @@ Tensor add_n_impl(const std::vector<Tensor>& x) {
phi::AddNInferMeta(x_metas, &meta_out);
using kernel_signature =
void (*)(const platform::DeviceContext&,
void (*)(const phi::DeviceContext&,
const std::vector<const phi::TensorBase*>&,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
......@@ -177,7 +177,7 @@ void embedding_grad_impl(const Tensor& x,
meta_out.set_dtype(input_weight->dtype());
kernel_out->set_height(input_weight->dims()[0]);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
......@@ -194,7 +194,7 @@ void embedding_grad_impl(const Tensor& x,
auto* kernel_out = SetKernelOutput(weight_grad);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
......@@ -229,7 +229,7 @@ void embedding_grad_impl(const Tensor& x,
auto* kernel_out = SetSelectedRowsKernelOutput(weight_grad);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(MakeMetaTensor(*input_weight), &meta_out);
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::SelectedRows&,
const phi::DenseTensor&,
......@@ -247,7 +247,7 @@ void embedding_grad_impl(const Tensor& x,
phi::MetaTensor meta_out(kernel_out);
meta_out.set_dims(input_weight->GetCompleteDims());
meta_out.set_dtype(input_weight->dtype());
using kernel_signature = void (*)(const platform::DeviceContext&,
using kernel_signature = void (*)(const phi::DeviceContext&,
const phi::DenseTensor&,
const phi::SelectedRows&,
const phi::DenseTensor&,
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/allocator.h"
#include "paddle/phi/core/enforce.h"
......@@ -35,11 +35,11 @@ DeviceContextPool& DeviceContextPool::Instance() {
const phi::DeviceContext* DeviceContextPool::Get(const Place& place) {
auto it = context_map_.find(place);
if (it == context_map_.end()) {
if (!paddle::platform::DeviceContextPool::IsInitialized()) {
if (!phi::DeviceContextPool::IsInitialized()) {
paddle::framework::InitDevices();
}
// only when we need the specific DeviceContext, get and cache it
auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
auto* dev_ctx = phi::DeviceContextPool::Instance().Get(place);
{
std::lock_guard<std::mutex> lock(mutex_);
context_map_[place] = dev_ctx;
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/cast_kernel.h"
......@@ -33,7 +33,7 @@ inline bool NeedTransformDataType(const DataType& input,
target == DataType::COMPLEX64 || target == DataType::COMPLEX128);
}
inline bool NeedTransformPlace(const paddle::platform::Place& input,
inline bool NeedTransformPlace(const phi::Place& input,
const Backend& target,
const TransformFlag& transform_flag) {
// NOTE(dev): The default value of TransformFlag is True, if it is set with
......@@ -52,12 +52,12 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
inline bool NeedTransformLayout(const DataLayout& input,
const DataLayout& target,
const paddle::platform::Place& place,
const phi::Place& place,
const TransformFlag& transform_flag) {
bool ret = transform_flag.need_trans_layout() &&
(input != DataLayout::ALL_LAYOUT &&
target != DataLayout::ALL_LAYOUT && input != target);
if (platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
return false;
}
return ret;
......@@ -65,10 +65,10 @@ inline bool NeedTransformLayout(const DataLayout& input,
inline phi::DenseTensor TransDataLayout(const phi::DenseTensor& tensor,
DataLayout layout) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto& pool = phi::DeviceContextPool::Instance();
VLOG(3) << "DataLayoutTransform src_layout: " << tensor.layout()
<< " dst_layout: " << layout;
if (platform::is_cpu_place(tensor.place())) {
if (tensor.place().GetType() == phi::AllocationType::CPU) {
auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
return phi::TransferLayout(*dev_ctx, tensor, layout);
} else {
......@@ -139,7 +139,7 @@ phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx,
inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
DataType dtype) {
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto& pool = phi::DeviceContextPool::Instance();
VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype()
<< " dst_dtype: " << dtype;
......@@ -147,11 +147,11 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
DefaultAllocator alloc(tensor.place());
phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()});
if (platform::is_cpu_place(tensor.place())) {
if (tensor.place().GetType() == phi::AllocationType::CPU) {
auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
return CastDataType(*dev_ctx, tensor, dtype);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else if (platform::is_gpu_place(tensor.place())) {
} else if (tensor.place().GetType() == phi::AllocationType::GPU) {
auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(tensor.place()));
return CastDataType(*dev_ctx, tensor, dtype);
#endif
......@@ -170,7 +170,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
auto& pool = phi::DeviceContextPool::Instance();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// NOTE(yy): TransDataPlace should wait for computation of input.
if (!platform::is_cuda_pinned_place(tensor.place())) {
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) {
pool.Get(tensor.place())->Wait();
pool.Get(dst_place)->Wait();
}
......
......@@ -112,7 +112,7 @@ DataType ParseDataType(const std::vector<Tensor>& tensors) {
auto n = tensors.size();
for (size_t i = 1; i < n; ++i) {
if (tensors[i].type() != dtype) {
PADDLE_THROW(platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"The data_type of input tensor in list isn't consistent, "
"the first tensor is %s, but %dth tensor is %s.",
dtype,
......
......@@ -872,7 +872,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
sr_out_trans_map = {'Tensor': 'phi::SelectedRows*'}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
kernel_args_type_list = ['const platform::DeviceContext&']
kernel_args_type_list = ['const phi::DeviceContext&']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
......
......@@ -127,7 +127,7 @@ class StringsAPI(ForwardAPI):
}
input_names = self.inputs['names']
input_infos = self.inputs['input_info']
kernel_args_type_list = ['const platform::DeviceContext&']
kernel_args_type_list = ['const phi::DeviceContext&']
attr_names = self.attrs['names']
kernel_param = self.kernel['param']
......
......@@ -3,6 +3,7 @@ add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
set(BACKENDS_DEPS allocator generator)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
......@@ -45,7 +46,8 @@ list(
stream.cc
event.cc
device_base.cc
device_manager.cc)
device_manager.cc
context_pool.cc)
if(WITH_CUSTOM_DEVICE)
list(APPEND BACKENDS_SRCS custom/custom_context.cc custom/custom_device.cc
......@@ -54,7 +56,6 @@ endif()
add_library(phi_backends "${BACKENDS_SRCS}")
target_link_libraries(phi_backends ${BACKENDS_DEPS})
add_dependencies(phi_backends eigen3)
# for inference library
get_property(phi_modules GLOBAL PROPERTY PHI_MODULES)
......
......@@ -26,11 +26,4 @@ limitations under the License. */
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#ifndef PADDLE_WITH_CUSTOM_KERNEL
// TODO(wilber): DeviceContextPool nees include fluid file.
#include "paddle/fluid/platform/device_context.h"
namespace phi {
using DeviceContextPool = paddle::platform::DeviceContextPool;
} // namespace phi
#endif
namespace phi {} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/context_pool_utils.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
bool allow_tf32_cudnn = true;
void SetAllowTF32Cudnn(bool active) { allow_tf32_cudnn = active; }
bool AllowTF32Cudnn() { return allow_tf32_cudnn; }
#endif // PADDLE_WITH_CUDA
static DeviceContextPool* pool = nullptr;
DeviceContextPool& DeviceContextPool::Instance() {
PADDLE_ENFORCE_NOT_NULL(pool,
phi::errors::PreconditionNotMet(
"Need to Create DeviceContextPool firstly!"));
return *pool;
}
EmplaceExternalContextFunc DeviceContextPool::emplace_external_context_func_ =
nullptr;
/*! \brief Create should only called by Init function */
DeviceContextPool& DeviceContextPool::Init(
const std::vector<phi::Place>& places, EmplaceExternalContextFunc func) {
emplace_external_context_func_ = func;
if (pool == nullptr) {
pool = new DeviceContextPool(places);
}
return *pool;
}
bool DeviceContextPool::IsInitialized() { return pool != nullptr; }
void DeviceContextPool::SetPool(DeviceContextPool* dev_pool) {
pool = dev_pool;
}
thread_local const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*
DeviceContextPool::external_device_contexts_ = nullptr;
phi::DeviceContext* DeviceContextPool::Get(const phi::Place& place) {
VLOG(6) << "DeviceContextPool Get: " << place;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
ptr;
if (external_device_contexts_ && external_device_contexts_->count(place)) {
ptr = external_device_contexts_;
} else {
ptr = &device_contexts_;
}
auto it = ptr->find(place);
if (it == ptr->end()) {
PADDLE_THROW(phi::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option "
"or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
}
size_t DeviceContextPool::Size() const {
if (external_device_contexts_) {
return external_device_contexts_->size();
}
return device_contexts_.size();
}
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
DeviceContextPool::device_contexts() const {
if (external_device_contexts_) {
return *external_device_contexts_;
}
return device_contexts_;
}
void DeviceContextPool::SetDeviceContexts(
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
dev_ctxs) {
external_device_contexts_ = dev_ctxs;
}
inline void EmplaceNativeContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
if (place.GetType() == phi::AllocationType::CPU) {
#ifdef PADDLE_WITH_MKLDNN
EmplaceDeviceContext<phi::OneDNNContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
EmplaceDeviceContext<phi::CPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#endif
} else if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<phi::GPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("GPUPlace is not supported. Please "
"re-compile with WITH_GPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::XPU) {
#ifdef PADDLE_WITH_XPU
EmplaceDeviceContext<phi::XPUContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
phi::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
EmplaceDeviceContext<phi::CustomContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"CustomPlace is not supported. Please re-compile with "
"WITH_CUSTOM_DEVICE "
"option."));
#endif
}
}
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func) {
PADDLE_ENFORCE_GT(
places.size(),
0,
phi::errors::InvalidArgument("The number of platform places should "
"be larger than 0. But received %d.",
places.size()));
std::set<Place> set;
for (auto& p : places) {
set.insert(p);
}
for (auto& p : set) {
EmplaceNativeContext(place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
if (emplace_external_context_func != nullptr) {
(*emplace_external_context_func)(
place_to_device_context,
p,
disable_setting_default_stream_for_allocator,
stream_priority);
}
}
}
DeviceContextPool::DeviceContextPool(const std::vector<phi::Place>& places) {
EmplaceDeviceContexts(&device_contexts_,
places,
/*disable_setting_default_stream_for_allocator=*/false,
/*stream_priority=*/0,
emplace_external_context_func_);
}
} // namespace phi
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future> // NOLINT
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/macros.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
bool AllowTF32Cublas();
extern bool allow_tf32_cudnn;
/*Set the value of the global variable allow_tf32_cudnn*/
void SetAllowTF32Cudnn(bool active);
/*Get the global variable allow_tf32_cudnn value*/
bool AllowTF32Cudnn();
#endif // PADDLE_WITH_CUDA
template <typename Place>
struct DefaultDeviceContextType;
template <>
struct DefaultDeviceContextType<phi::CPUPlace> {
using TYPE = phi::CPUContext;
};
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <>
struct DefaultDeviceContextType<phi::GPUPlace> {
using TYPE = phi::GPUContext;
};
#endif
#ifdef PADDLE_WITH_XPU
template <>
struct DefaultDeviceContextType<phi::XPUPlace> {
using TYPE = phi::XPUContext;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = phi::CustomContext;
};
#else
template <>
struct DefaultDeviceContextType<phi::CustomPlace> {
using TYPE = DeviceContext;
};
#endif
using EmplaceExternalContextFunc = void (*)(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*,
const phi::Place&,
bool,
int);
void EmplaceDeviceContexts(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const std::vector<phi::Place>& places,
bool disable_setting_default_stream_for_allocator,
int stream_priority,
EmplaceExternalContextFunc emplace_external_context_func = nullptr);
/*! \brief device context pool singleton */
class DeviceContextPool {
public:
static DeviceContextPool& Instance();
/*! \brief Create should only called by Init function */
static DeviceContextPool& Init(const std::vector<phi::Place>& places,
EmplaceExternalContextFunc func = nullptr);
static bool IsInitialized();
static void SetPool(DeviceContextPool* dev_pool);
/*! \brief Return handle of single device context. */
phi::DeviceContext* Get(const phi::Place& place);
template <typename Place>
const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
const Place& place) {
return reinterpret_cast<
const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
}
size_t Size() const;
const std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>&
device_contexts() const;
static void SetDeviceContexts(
const std::map<Place,
std::shared_future<std::unique_ptr<DeviceContext>>>*);
private:
explicit DeviceContextPool(const std::vector<phi::Place>& places);
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>
device_contexts_;
static thread_local const std::
map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
external_device_contexts_; // not owned
static EmplaceExternalContextFunc emplace_external_context_func_;
DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
};
} // namespace phi
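A short usage sketch (editor's addition) of the typed lookup declared in this new header: `GetByPlace` resolves the concrete context type through the `DefaultDeviceContextType` trait, so callers get a `phi::CPUContext*` or `phi::GPUContext*` without a manual cast. It assumes the pool has already been initialized via `Init()`.

```cpp
#include "paddle/phi/backends/context_pool.h"

const phi::CPUContext* GetCpuContext() {
  auto& pool = phi::DeviceContextPool::Instance();
  // DefaultDeviceContextType<phi::CPUPlace>::TYPE is phi::CPUContext,
  // so no static_cast is needed at the call site.
  return pool.GetByPlace(phi::CPUPlace());
}
```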
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/core/generator.h"
namespace phi {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename DevCtx>
typename std::enable_if<!std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, /*unused*/ int stream_priority = 0) {
return new DevCtx(p);
}
template <typename DevCtx>
typename std::enable_if<std::is_same<DevCtx, phi::GPUContext>::value,
DevCtx*>::type
ConstructDevCtx(const phi::Place& p, int stream_priority) {
return new DevCtx(p, /*init=*/true, stream_priority);
}
#else
template <typename DevCtx>
DevCtx* ConstructDevCtx(const phi::Place& p,
/*unused*/ int stream_priority) {
return new DevCtx(p);
}
#endif
template <typename DevCtx>
inline std::unique_ptr<DeviceContext> CreateDeviceContext(
const phi::Place& p,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
using PtrType = std::unique_ptr<DeviceContext>;
DevCtx* dev_ctx = ConstructDevCtx<DevCtx>(p, stream_priority);
if (p.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* cuda_ctx = dynamic_cast<phi::GPUContext*>(dev_ctx);
PADDLE_ENFORCE_NOT_NULL(
cuda_ctx,
phi::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into phi::GPUContext."));
auto& instance = paddle::memory::allocation::AllocatorFacade::Instance();
if (!disable_setting_default_stream_for_allocator) {
instance.SetDefaultStream(GPUPlace(p.GetDeviceId()), cuda_ctx->stream());
}
dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get());
dev_ctx->SetPinnedAllocator(
instance.GetAllocator(phi::GPUPinnedPlace()).get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(phi::DefaultCUDAGenerator(p.GetDeviceId()).get());
#endif
} else if (p.GetType() == phi::AllocationType::XPU) {
#if defined(PADDLE_WITH_XPU)
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultXPUGenerator(p.GetDeviceId()).get());
#endif
} else {
dev_ctx->SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p)
.get());
dev_ctx->SetGenerator(phi::DefaultCPUGenerator().get());
}
dev_ctx->SetHostGenerator(phi::DefaultCPUGenerator().get());
dev_ctx->SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx->SetZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(p)
.get());
dev_ctx->SetHostZeroAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetZeroAllocator(phi::CPUPlace())
.get());
return PtrType(dev_ctx);
}
template <typename DevCtx>
inline void EmplaceDeviceContext(
std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>>*
place_to_device_context,
const phi::Place& place,
bool disable_setting_default_stream_for_allocator,
int stream_priority) {
// lazy evaluation. i.e., only create device context at first `Get`
place_to_device_context->emplace(
place,
std::async(std::launch::deferred,
CreateDeviceContext<DevCtx>,
place,
disable_setting_default_stream_for_allocator,
stream_priority));
}
} // namespace phi
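The helper above is what both phi's native path and a framework-side callback can build on. Below is a hedged sketch of an external emplace function in the shape fluid's `EmplaceExternalContext` uses; `MyPinnedContext` is a hypothetical stand-in for a framework-only context such as `CUDAPinnedDeviceContext`, not a class from this commit.

```cpp
#include <future>
#include <map>
#include <memory>

#include "paddle/phi/backends/context_pool_utils.h"

// Hypothetical framework-only context; real examples on the fluid side of this
// diff are CUDAPinnedDeviceContext, MLUDeviceContext, IPUDeviceContext, etc.
class MyPinnedContext : public phi::DeviceContext {
 public:
  explicit MyPinnedContext(const phi::Place& place) : place_(place) {}
  const phi::Place& GetPlace() const override { return place_; }

 private:
  phi::Place place_;
};

void MyEmplaceExternalContext(
    std::map<phi::Place,
             std::shared_future<std::unique_ptr<phi::DeviceContext>>>* ctxs,
    const phi::Place& place,
    bool disable_setting_default_stream_for_allocator,
    int stream_priority) {
  if (place.GetType() == phi::AllocationType::GPUPINNED) {
    // Reuse phi's generic helper: it defers construction until the first
    // Get() and wires up allocators/generators in CreateDeviceContext above.
    phi::EmplaceDeviceContext<MyPinnedContext>(
        ctxs,
        place,
        disable_setting_default_stream_for_allocator,
        /*unused*/ stream_priority);
  }
}
```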
......@@ -66,8 +66,10 @@ void CustomContext::SetStream(std::shared_ptr<phi::stream::Stream> stream) {
void CustomContext::Wait() const { return impl_->Wait(); }
CustomContext::CustomContext(const CustomPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
CustomContext::~CustomContext() {}
CustomContext::~CustomContext() { impl_->Init(); }
} // namespace phi
......@@ -20,6 +20,10 @@ limitations under the License. */
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace phi {
class CustomContext : public DeviceContext,
......@@ -42,6 +46,15 @@ class CustomContext : public DeviceContext,
// Wait for all operations completion in the stream.
void Wait() const override;
template <typename Callback>
void AddStreamCallback(Callback&& callback) const {
return GetStream()->AddCallback(callback);
}
void WaitStreamCallback() const { return GetStream()->WaitCallback(); }
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
static const char* name() { return "CustomContext"; }
public:
......
......@@ -12,9 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/phi/backends/callback_manager.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/custom/enforce_custom.h"
#include "paddle/phi/backends/device_base.h"
#include "paddle/phi/backends/device_guard.h"
......@@ -285,8 +285,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_h2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_h2d) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_h2d(device, dst, src, size));
......@@ -306,8 +305,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2h(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2h) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2h(device, dst, src, size));
......@@ -327,8 +325,7 @@ class CustomDevice : public DeviceInterface {
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->async_memory_copy_d2d(device, c_stream, dst, src, size));
} else if (pimpl_->memory_copy_d2d) {
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_d2d(device, dst, src, size));
......@@ -364,8 +361,7 @@ class CustomDevice : public DeviceInterface {
MemoryCopyH2D(dst_dev_id, dst, tmp.get(), size);
} else {
auto src_place = CustomPlace(Type(), src_dev_id);
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
pool.Get(src_place)->Wait();
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(
pimpl_->memory_copy_p2p(dst_device, src_device, dst, src, size));
......
......@@ -18,7 +18,7 @@
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/cuda/cuda_graph.h"
#include "paddle/phi/kernels/funcs/dropout_impl_util.h"
#endif
......
......@@ -18,7 +18,7 @@
#include "paddle/phi/core/enforce.h"
#include "paddle/utils/flat_hash_map.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/core/expect.h"
namespace phi {
......@@ -42,8 +42,7 @@ OneDNNContextThreadLocals::Body::~Body() {
auto cpu_place = phi::CPUPlace();
// TODO(YuanRisheng): we need remove the dependency on fluid device context
// here
paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance();
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
OneDNNContext* dev_ctx = static_cast<OneDNNContext*>(pool.Get(cpu_place));
dev_ctx->ResetBlobMap(exec_ptr_);
}
......
......@@ -44,14 +44,14 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) {
return nullptr;
}
#endif
if (paddle::platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
return static_cast<paddle::distributed::ProcessGroupNCCL*>(pg)->NCCLComm(
place);
#else
return nullptr;
#endif
} else if (paddle::platform::is_custom_place(place)) {
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
return static_cast<paddle::distributed::ProcessGroupCustom*>(pg)
->CustomCCLComm(place);
......
......@@ -125,10 +125,14 @@ struct XPUContext::Impl {
xpu::BKCLContext_t bkcl_context_{nullptr};
};
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {}
XPUContext::XPUContext() : DeviceContext(), impl_(std::make_unique<Impl>()) {
impl_->Init();
}
XPUContext::XPUContext(const XPUPlace& place)
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<Impl>(place)) {
impl_->Init();
}
XPUContext::~XPUContext() = default;
......
......@@ -24,6 +24,10 @@ limitations under the License. */
#include "paddle/phi/core/device_context.h"
#include "xpu/runtime.h"
namespace Eigen {
struct DefaultDevice;
} // namespace Eigen
namespace xpu = baidu::xpu::api;
namespace phi {
......@@ -65,6 +69,8 @@ class XPUContext : public DeviceContext,
void SetL3Cache(int l3_size = 14155776);
Eigen::DefaultDevice* eigen_device() const { return nullptr; }
XPUStream stream() const;
static const char* name() { return "XPUContext"; }
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/tensor_utils.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/enforce.h"
......
......@@ -118,7 +118,7 @@ cc_library(
cc_library(
mixed_vector
SRCS mixed_vector.cc
DEPS device_context place memory)
DEPS phi_backends place memory)
cc_library(
generator
......@@ -135,24 +135,20 @@ if(WITH_GPU)
nv_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_ROCM)
hip_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
elseif(WITH_XPU_KP)
xpu_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS phi_backends dense_tensor selected_rows memcpy device_context
memory_utils)
DEPS phi_backends dense_tensor selected_rows memcpy memory_utils)
else()
cc_library(
phi_tensor_utils
SRCS tensor_utils.cc
DEPS dense_tensor selected_rows memcpy device_context phi_backends
memory_utils)
DEPS dense_tensor selected_rows memcpy phi_backends memory_utils)
endif()
......@@ -468,6 +468,59 @@ struct EnforceNotMet : public std::exception {
/** EXTENDED TOOL FUNCTIONS WITH CHECKING **/
/*
 * Summary: This macro is used to get the Variable or internal-type
 * data (such as LoDTensor or SelectedRows) of an Input or
 * Output in an op. It is generally used when calling scope.FindVar(Input/
 * Output("Name")) or ctx.Input<LoDTensor>().
 * First, this macro checks whether the obtained pointer is null,
 * and then returns the data if it is not.
 *
 * Note: This macro is only suitable for specific scenarios and
 * is not intended to be widely used. If it cannot meet the
 * requirements, please use the other PADDLE_ENFORCE** check macros.
*
* Parameters:
 * __PTR: pointer
* __ROLE: (string), Input or Output
* __NAME: (string), Input or Output name
* __OP_TYPE: (string), the op type
*
* Return: The data pointed to by the pointer.
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
if (UNLIKELY(nullptr == __ptr)) { \
auto __summary__ = phi::errors::NotFound( \
"Unable to get %s data of %s %s in operator %s. " \
"Possible reasons are:\n" \
" 1. The %s is not the %s of operator %s;\n" \
" 2. The %s has no corresponding variable passed in;\n" \
" 3. The %s corresponding variable is not initialized.", \
phi::demangle( \
typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
.name()), \
__ROLE, \
__NAME, \
__OP_TYPE, \
__NAME, \
__ROLE, \
__OP_TYPE, \
__NAME, \
__NAME); \
auto __message__ = ::paddle::string::Sprintf( \
"%s\n [Hint: pointer " #__PTR " should not be null.]", \
__summary__.error_message()); \
__THROW_ERROR_INTERNAL__( \
phi::ErrorSummary(__summary__.code(), __message__)); \
} \
return *__ptr; \
})())
/*
 * Summary: The PADDLE_GET(_**) series of macros is used to call paddle::get
 * safely. paddle::get is not a completely safe API, although it will not
......
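A hedged usage sketch of the GET_DATA_SAFELY macro added above; the wrapper function, variable names, and op name are placeholders, and any pointer-returning accessor (e.g. ctx.Input<...>()) can be used as the first argument:

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"

// Dereference an op input only after the macro's null check; a null pointer
// raises the NotFound error built inside the macro.
const phi::DenseTensor& GetInputX(const phi::DenseTensor* maybe_x) {
  return GET_DATA_SAFELY(maybe_x, "Input", "X", "my_op");
}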
......@@ -22,7 +22,7 @@ limitations under the License. */
#include <vector>
#include "glog/logging.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/utils/none.h"
#include "paddle/utils/optional.h"
......
......@@ -14,14 +14,13 @@ limitations under the License. */
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/platform/device_context.h"
namespace phi {
template <typename Context>
......
......@@ -62,10 +62,10 @@ void IndexSelectInner(const Context& ctx,
auto index_size = index.dims()[0];
DenseTensor index_cpu_copy;
if (!paddle::platform::is_cpu_place(index.place())) {
if (index.place().GetType() != phi::AllocationType::CPU) {
phi::Copy(ctx, index, phi::CPUPlace(), true, &index_cpu_copy);
}
const IndexT* index_data = paddle::platform::is_cpu_place(index.place())
const IndexT* index_data = index.place().GetType() == phi::AllocationType::CPU
? index.data<IndexT>()
: index_cpu_copy.data<IndexT>();
ctx.template Alloc<T>(output);
......
......@@ -160,5 +160,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
int64_t,
float,
double,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -146,5 +146,5 @@ PD_REGISTER_KERNEL(overlap_add,
int64_t,
float,
double,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
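The complex dtypes in these registrations now come from phi rather than paddle::platform. A minimal usage sketch; the header path is an assumption, not shown in this diff:

#include "paddle/phi/common/complex.h"

// phi::dtype::complex<T> is the drop-in replacement used in the
// PD_REGISTER_KERNEL lists above.
phi::dtype::complex<float> MakeUnitImaginary() {
  return phi::dtype::complex<float>(0.0f, 1.0f);
}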
......@@ -33,7 +33,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
DenseTensor* x_grad,
DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU."));
......
......@@ -32,7 +32,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
const std::string& reduce,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU."));
......
......@@ -62,7 +62,7 @@ void RepeatInterleaveWithTensorIndexGradKernel(
paddle::framework::DataTypeToString(
paddle::framework::proto::VarType::INT64)));
paddle::platform::DeviceContextPool::Instance().Get(repeats_tensor.place());
phi::DeviceContextPool::Instance().Get(repeats_tensor.place());
if (index_type == paddle::framework::proto::VarType::INT32) {
phi::funcs::RepeatsTensor2IndexTensor<Context, int>(
ctx, repeats_tensor, &index);
......
......@@ -30,7 +30,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("This kernel only runs on CPU."));
......
......@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU,
true,
errors::PreconditionNotMet("This kernel only runs on CPU."));
......
cc_library(
blas
SRCS blas.cc
DEPS cblas framework_proto device_context)
DEPS cblas framework_proto phi_backends)
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/common/place.h"
......
......@@ -16,7 +16,7 @@
#include "glog/logging.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/layout.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
#include "paddle/phi/kernels/funcs/lstm_compute.h"
......@@ -218,7 +218,7 @@ __global__ void KeLstmBackward(Op op,
}
template <class T, class Op>
void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
void gpu_lstm_forward(const phi::DeviceContext& context,
Op op,
phi::funcs::LstmMetaValue<T> value,
int frame_size,
......@@ -269,7 +269,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context,
}
template <class T, class Op>
void gpu_lstm_backward(const paddle::platform::DeviceContext& context,
void gpu_lstm_backward(const phi::DeviceContext& context,
Op op,
phi::funcs::LstmMetaValue<T> value,
phi::funcs::LstmMetaGrad<T> grad,
......
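These LSTM helpers now accept the base phi::DeviceContext. A minimal sketch of that pattern for GPU builds; the function name is illustrative, and the downcast assumes the caller passes a GPU context, as the surrounding code does:

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/device_context.h"

// Helpers take the polymorphic base class and downcast only where a
// backend-specific handle (here the CUDA/HIP stream) is required.
void LaunchOnGpu(const phi::DeviceContext& context) {
  auto stream = static_cast<const phi::GPUContext&>(context).stream();
  (void)stream;  // real helpers launch their kernels on this stream
}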
......@@ -35,8 +35,8 @@ inline std::vector<size_t> GetLodFromRoisNum(const Context& dev_ctx,
std::vector<size_t> rois_lod;
auto* rois_num_data = rois_num->data<int>();
DenseTensor cpu_tensor;
if (paddle::platform::is_gpu_place(rois_num->place()) ||
paddle::platform::is_xpu_place(rois_num->place())) {
if (rois_num->place().GetType() == phi::AllocationType::GPU ||
rois_num->place().GetType() == phi::AllocationType::XPU) {
Copy<Context>(dev_ctx, *rois_num, phi::CPUPlace(), true, &cpu_tensor);
rois_num_data = cpu_tensor.data<int>();
}
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -986,7 +986,7 @@ static void ElemwiseGradBroadcast1CUDA(gpuStream_t stream,
dim3 grid_size = dim3((w + BLOCK_X - 1) / BLOCK_X);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(gplace));
phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
FastElemwiseGradBroadcast1CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, h, w, is_xsize_larger, dx_op, dy_op, dx, dy);
......@@ -1010,8 +1010,8 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream,
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
dim3 grid_size = dim3(n);
auto gplace = phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId());
auto *ctx = static_cast<GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(gplace));
auto *ctx =
static_cast<GPUContext *>(phi::DeviceContextPool::Instance().Get(gplace));
phi::backends::gpu::LimitGridDim(*ctx, &grid_size);
ElemwiseGradBroadcast2CUDAKernel<<<grid_size, block_size, 0, stream>>>(
x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy);
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/fc_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/jit/kernels.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
namespace phi {
namespace funcs {
......
......@@ -9,12 +9,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
#include "paddle/phi/kernels/funcs/gru_compute.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h"
#include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
#include "paddle/phi/kernels/funcs/gru_compute.h"
namespace phi {
namespace funcs {
......
......@@ -11,7 +11,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
......@@ -14,6 +14,7 @@
#pragma once
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......@@ -122,20 +123,20 @@ inline std::vector<T> get_new_data_from_tensor(
DenseTensor cpu_starts_tensor;
auto& pool = phi::DeviceContextPool::Instance();
phi::DeviceContext* dev_ctx = pool.Get(new_data_tensor->place());
if (paddle::platform::is_gpu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::GPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#ifdef PADDLE_WITH_ASCEND_CL
if (paddle::platform::is_npu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::NPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
}
#endif
#ifdef PADDLE_WITH_XPU
if (paddle::platform::is_xpu_place(new_data_tensor->place())) {
if (new_data_tensor->place().GetType() == phi::AllocationType::XPU) {
phi::Copy(
*dev_ctx, *new_data_tensor, phi::CPUPlace(), true, &cpu_starts_tensor);
new_data = cpu_starts_tensor.data<T>();
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
......
......@@ -26,6 +26,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -52,22 +53,6 @@ template struct SetConstant<phi::CPUContext, phi::dtype::complex<float>>;
template struct SetConstant<phi::CPUContext, phi::dtype::complex<double>>;
#ifdef PADDLE_WITH_XPU
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::XPUDeviceContext, float>;
template struct SetConstant<paddle::platform::XPUDeviceContext, double>;
template struct SetConstant<paddle::platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int16_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int>;
template struct SetConstant<paddle::platform::XPUDeviceContext, int64_t>;
template struct SetConstant<paddle::platform::XPUDeviceContext, bool>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::XPUContext, phi::dtype::float16>;
template struct SetConstant<phi::XPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::XPUContext, float>;
......@@ -164,10 +149,9 @@ struct TensorSetConstantCPU {
};
template <>
void set_constant_with_place<paddle::platform::XPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::XPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
#ifdef PADDLE_WITH_XPU
phi::VisitDataType(
tensor->dtype(),
......@@ -178,64 +162,54 @@ void set_constant_with_place<paddle::platform::XPUPlace>(
}
template <>
void set_constant_with_place<paddle::platform::NPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::NPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::NPUPinnedPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::NPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::IPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::IPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::CustomPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::CustomPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
PADDLE_THROW(phi::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<phi::CPUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<paddle::platform::MLUPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
void set_constant_with_place<paddle::platform::CUDAPinnedPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::GPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace
: public std::unary_function<paddle::platform::Place, void> {
TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context,
: public std::unary_function<phi::Place, void> {
TensorSetConstantWithPlace(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
......@@ -245,17 +219,17 @@ struct TensorSetConstantWithPlace
set_constant_with_place<Place>(context_, tensor_, value_);
}
const paddle::platform::DeviceContext& context_;
const phi::DeviceContext& context_;
phi::DenseTensor* tensor_;
float value_;
};
void set_constant(const paddle::platform::DeviceContext& context,
void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (paddle::platform::is_custom_place(context.GetPlace())) {
if (context.GetPlace().GetType() == phi::AllocationType::CUSTOM) {
func(phi::CPUPlace());
return;
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/data_type.h"
......@@ -187,8 +188,8 @@ void TransposeNormal<DeviceContext, T>::operator()(
auto* out_ptr = out->data<T>();
// copy in_stride, out_stride, axis to gpu device
const paddle::platform::CUDAPlace& cuda_place = context.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
......@@ -231,7 +232,7 @@ struct TransposeNormal<phi::GPUContext, T> {
// copy in_stride, out_stride, axis to gpu device
const phi::GPUPlace& cuda_place = context.GetPlace();
phi::CPUPlace cpu_place = paddle::platform::CPUPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
size_t size = 3 * rank * sizeof(int64_t);
auto cpu_buf_holder = phi::memory_utils::Alloc(cpu_place, size);
auto cuda_buf_holder = phi::memory_utils::Alloc(cuda_place, size);
......@@ -286,7 +287,7 @@ DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<float>);
DEFINE_GPU_TRANS_NORMAL(phi::dtype::complex<double>);
struct TensorSetConstantGPU {
TensorSetConstantGPU(const paddle::platform::DeviceContext& context,
TensorSetConstantGPU(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
......@@ -299,16 +300,15 @@ struct TensorSetConstantGPU {
static_cast<T>(value_));
}
const paddle::platform::DeviceContext& context_;
const phi::DeviceContext& context_;
phi::DenseTensor* tensor_;
float value_;
};
template <>
void set_constant_with_place<paddle::platform::CUDAPlace>(
const paddle::platform::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
void set_constant_with_place<phi::GPUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantGPU(context, tensor, value));
}
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/utils/data_type.h"
......@@ -56,24 +56,19 @@ struct SetConstant {
#ifdef PADDLE_WITH_XPU
template <typename T>
struct SetConstant<XPUContext, T> {
void operator()(const XPUContext& context, phi::DenseTensor* tensor, T num);
};
template <typename T>
struct SetConstant<paddle::platform::XPUDeviceContext, T> {
void operator()(const paddle::platform::XPUDeviceContext& context,
struct SetConstant<phi::XPUContext, T> {
void operator()(const phi::XPUContext& context,
phi::DenseTensor* tensor,
T num);
};
#endif
template <typename Place>
void set_constant_with_place(const paddle::platform::DeviceContext& context,
void set_constant_with_place(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value);
void set_constant(const paddle::platform::DeviceContext& context,
void set_constant(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value);
......@@ -109,9 +104,7 @@ struct RowwiseMean {
#ifdef PADDLE_WITH_XPU
template <typename U>
struct TensorSetConstantXPU {
TensorSetConstantXPU(phi::DenseTensor* tensor,
U value,
paddle::platform::Place place)
TensorSetConstantXPU(phi::DenseTensor* tensor, U value, phi::Place place)
: tensor_(tensor), value_(value), place_(place) {}
template <typename T>
void apply() const {
......@@ -127,7 +120,7 @@ struct TensorSetConstantXPU {
}
phi::DenseTensor* tensor_;
U value_;
paddle::platform::Place place_;
phi::Place place_;
};
#endif
......
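The SetConstant functor declared here keeps its call pattern after the move; only the context and place types change. A minimal sketch, assuming the declarations live in paddle/phi/kernels/funcs/math_function.h:

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

// Zero-fill a tensor using only phi types.
void ZeroFill(const phi::CPUContext& dev_ctx, phi::DenseTensor* out) {
  dev_ctx.Alloc<float>(out);
  phi::funcs::SetConstant<phi::CPUContext, float> set_zero;
  set_zero(dev_ctx, out, 0.0f);
}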
......@@ -34,17 +34,9 @@ void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
#ifdef PADDLE_WITH_XPU
template <typename T>
void SetConstant<XPUContext, T>::operator()(const XPUContext& context,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
}
template <typename T>
void SetConstant<paddle::platform::XPUDeviceContext, T>::operator()(
const paddle::platform::XPUDeviceContext& context,
phi::DenseTensor* tensor,
T num) {
void SetConstant<phi::XPUContext, T>::operator()(const phi::XPUContext& context,
phi::DenseTensor* tensor,
T num) {
phi::VisitDataType(tensor->dtype(),
TensorSetConstantXPU<T>(tensor, num, context.GetPlace()));
}
......@@ -65,7 +57,7 @@ void Transpose<DeviceContext, T, Rank>::operator()(
auto* dev = context.eigen_device();
// use 32bit index to speed up computation
bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace());
bool is_gpu_place = context.GetPlace().GetType() == phi::AllocationType::GPU;
if (use_32bit_index && is_gpu_place) {
To32BitIndex(eigen_out).device(*dev) =
To32BitIndex(eigen_in).shuffle(permute);
......
......@@ -415,7 +415,7 @@ struct ReduceConfig {
#ifdef PADDLE_WITH_XPU_KP
bool not_higher = x_dim[0] > 1;
#else
int device_id = paddle::platform::GetCurrentDeviceId();
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_grid_z = phi::backends::gpu::GetGpuMaxGridDimSize(device_id)[2];
bool not_higher = x_dim[0] >= max_grid_z;
#endif // PADDLE_WITH_XPU_KP
......@@ -467,10 +467,10 @@ struct ReduceConfig {
grid_num = details::CeilingDiv(left_num, block_dim->x);
reduce_num_per_thread = details::CeilingDiv(reduce_num, block_dim->y);
}
int device_id = paddle::platform::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp;
int num_threads = block_dim->x * block_dim->y;
int max_num_blocks = max_threads / num_threads;
......@@ -509,10 +509,10 @@ struct ReduceConfig {
int grid_z = left_num / last_dim_num;
left_num = last_dim_num;
grid_dim->z = grid_z;
int device_id = paddle::platform::GetCurrentDeviceId();
int max_mp = paddle::platform::GetGPUMultiProcessors(device_id);
int device_id = phi::backends::gpu::GetCurrentDeviceId();
int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
int max_threads_per_mp =
paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id);
phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
int max_threads = max_threads_per_mp * max_mp;
// init
int num_block = (max_threads / left_num);
......
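The occupancy math above relies on device-query helpers that moved under phi::backends::gpu. A minimal sketch of the same max-threads bound, for GPU builds only:

#include "paddle/phi/backends/gpu/gpu_info.h"

// Mirrors the grid/block sizing logic in ReduceConfig using only phi helpers.
int MaxResidentThreadsOnCurrentDevice() {
  int device_id = phi::backends::gpu::GetCurrentDeviceId();
  int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id);
  int max_threads_per_mp =
      phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id);
  return max_threads_per_mp * max_mp;
}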
......@@ -22,13 +22,13 @@ void RepeatsTensor2IndexTensor(const Context& ctx,
const DenseTensor& repeats,
DenseTensor* index) {
DenseTensor repeats_cpu_copy;
if (!paddle::platform::is_cpu_place(repeats.place())) {
phi::Copy(
ctx, repeats, paddle::platform::CPUPlace(), true, &repeats_cpu_copy);
if (repeats.place().GetType() != phi::AllocationType::CPU) {
phi::Copy(ctx, repeats, phi::CPUPlace(), true, &repeats_cpu_copy);
}
const RepeatsT* repeats_data = paddle::platform::is_cpu_place(repeats.place())
? repeats.data<RepeatsT>()
: repeats_cpu_copy.data<RepeatsT>();
const RepeatsT* repeats_data =
repeats.place().GetType() == phi::AllocationType::CPU
? repeats.data<RepeatsT>()
: repeats_cpu_copy.data<RepeatsT>();
int64_t index_size = 0;
for (int i = 0; i < repeats.dims()[0]; i++) {
......
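The repeats handling above follows a pattern that recurs in this series: copy to CPU only when the tensor is not already CPU-resident. A minimal sketch with an int-typed tensor; the helper name and element type are illustrative:

#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/tensor_utils.h"

// Return a CPU-readable pointer, copying synchronously only when needed.
template <typename Context>
const int* GetCpuIntData(const Context& ctx,
                         const phi::DenseTensor& t,
                         phi::DenseTensor* cpu_copy) {
  if (t.place().GetType() != phi::AllocationType::CPU) {
    phi::Copy(ctx, t, phi::CPUPlace(), /*blocking=*/true, cpu_copy);
    return cpu_copy->data<int>();
  }
  return t.data<int>();
}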
......@@ -389,8 +389,8 @@ void SelectKernel(const KPDevice &dev_ctx,
using CT = int64_t; // set Count_data Type
const int t_size = sizeof(CT);
const paddle::platform::CUDAPlace &cuda_place = dev_ctx.GetPlace();
paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
const phi::GPUPlace &cuda_place = dev_ctx.GetPlace();
phi::CPUPlace cpu_place = phi::CPUPlace();
  // 1.1 get the number of stored elements per block
const int kVecSize = 4;
......
......@@ -76,17 +76,17 @@ struct SelectedRowsAdd<phi::GPUContext, T> {
auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in2_place),
PADDLE_ENFORCE_EQ(in2_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(out_place),
PADDLE_ENFORCE_EQ(out_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
......@@ -237,12 +237,12 @@ struct SelectedRowsAddTo<phi::GPUContext, T> {
}
auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(in1_place),
PADDLE_ENFORCE_EQ(in1_place.GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"The running environment is not on the GPU place."));
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/mixed_vector.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#endif
namespace phi {
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/phi/core/mixed_vector.h"
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/lod_utils.h"
......
......@@ -16,7 +16,7 @@
#include <sstream>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -33,7 +33,7 @@ static const std::vector<T> &ToVector(const std::vector<T> &vec) {
template <typename T>
static std::vector<T> ToVector(const T *x, size_t n, const phi::Place &place) {
#ifdef __NVCC__
if (paddle::platform::is_gpu_place(place)) {
if (place.GetType() == phi::AllocationType::GPU) {
using CopyT = typename std::
conditional<std::is_same<T, bool>::value, uint8_t, T>::type;
std::vector<CopyT> cpu_x(n);
......
......@@ -64,7 +64,7 @@ void Conv2dFusionKernel(const Context& ctx,
pad_w0 = paddings[2];
pad_w1 = paddings[3];
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Attr paddins in conv2d_fusion must have 2 or 4 elements, but now have "
"%u elements.",
paddings.size()));
......@@ -111,7 +111,7 @@ void Conv2dFusionKernel(const Context& ctx,
params.residual = reinterpret_cast<const half*>(residual->data<T>());
Conv2dBiasAddRelu(params);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Cutlass now only support relu activation in a residual block"));
}
} else if (activation == "relu") {
......
......@@ -319,7 +319,7 @@ void InitMoeRoutingKernelLauncher(
ec_route);
}
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
PADDLE_THROW(phi::errors::InvalidArgument(
"Currently only support `ec_route = True`. "));
}
}
......@@ -401,7 +401,7 @@ void GenericMoeGemmKernelLauncher(const T* A,
int occupancy = GemmGrouped::maximum_active_blocks();
const int threadblock_count = multi_processor_count * occupancy;
if (occupancy == 0) {
PADDLE_THROW(paddle::platform::errors::Fatal(
PADDLE_THROW(phi::errors::Fatal(
"[MoE Runner] GPU lacks the shared memory resources to run GroupedGEMM "
"kernel"));
}
......@@ -425,21 +425,21 @@ void GenericMoeGemmKernelLauncher(const T* A,
if (can_implement != cutlass::Status::kSuccess) {
std::string err_msg = "MoEFC kernel will fail for params. Error: " +
std::string(cutlassGetStatusString(can_implement));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
auto init_status = gemm.initialize(args);
if (init_status != cutlass::Status::kSuccess) {
std::string err_msg =
"Failed to initialize cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(init_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
auto run_status = gemm.run(stream);
if (run_status != cutlass::Status::kSuccess) {
std::string err_msg =
"Failed to run cutlass variable batched gemm. Error: " +
std::string(cutlassGetStatusString(run_status));
PADDLE_THROW(paddle::platform::errors::Fatal("[MoE Runner] " + err_msg));
PADDLE_THROW(phi::errors::Fatal("[MoE Runner] " + err_msg));
}
}
......
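Error construction in these kernels switches from paddle::platform::errors to phi::errors while PADDLE_THROW stays the same. A minimal sketch; the helper and message are illustrative:

#include "paddle/phi/core/enforce.h"

// Validate an attribute count with the phi error factories.
void CheckPaddingCount(size_t n) {
  if (n != 2 && n != 4) {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Expected 2 or 4 padding elements, but got %u.", n));
  }
}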
......@@ -22,6 +22,7 @@
#include <memory>
#include <unordered_map>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/cuda/cudnn_desc.h"
#include "paddle/phi/common/backend.h"
......@@ -313,12 +314,8 @@ class CudnnConvDescManager {
int groups,
cudnnDataType_t dtype) {
auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
desc->set(dtype,
paddings,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
desc->set(
dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
return desc;
}
......
......@@ -16,7 +16,6 @@
#include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -16,7 +16,7 @@
#include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
......
......@@ -369,10 +369,9 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(
dev_ctx.GetPlace()))
phi::DeviceContextPool::Instance().Get(dev_ctx.GetPlace()))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/hostdevice.h"
#ifdef __NVCC__
......@@ -1721,34 +1721,36 @@ class DepthwiseConvFilterGradFunctor<phi::GPUContext,
template class DepthwiseConvFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, false>;
template class DepthwiseConvFunctor<phi::GPUContext,
phi::dtype::float16,
false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, false>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
false>;
template class DepthwiseConvFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFunctor<phi::GPUContext, platform::float16, true>;
template class DepthwiseConvFunctor<phi::GPUContext, phi::dtype::float16, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvInputGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, float, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext, double, true>;
template class DepthwiseConvFilterGradFunctor<phi::GPUContext,
platform::float16,
phi::dtype::float16,
true>;
} // namespace math
......
......@@ -385,17 +385,17 @@ void InstanceNormGradKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
......@@ -406,27 +406,23 @@ void InstanceNormGradKernel(const Context &dev_ctx,
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto *saved_mean_data =
......@@ -435,49 +431,47 @@ void InstanceNormGradKernel(const Context &dev_ctx,
saved_variance.template data<BatchNormParamType<T>>();
if (d_scale && d_bias) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
miopenBNSpatial,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
miopenBNSpatial,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
dev_ctx.cudnn_handle(),
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
x_tmp.template data<T>(),
data_desc_,
d_y_tmp.template data<T>(),
data_desc_,
d_x->template data<T>(),
in_param_desc_,
scale_tmp.template data<BatchNormParamType<T>>(),
d_scale_tmp.template data<BatchNormParamType<T>>(),
d_bias_tmp.template data<BatchNormParamType<T>>(),
epsilon,
saved_mean_data,
saved_var_data));
#endif
} else {
if (d_x) {
......@@ -502,14 +496,14 @@ void InstanceNormGradKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
......
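The cuDNN/MIOpen calls in these kernels now go through phi::dynload, wrapped in the same PADDLE_ENFORCE_GPU_SUCCESS check. A minimal sketch for the CUDA path only; it assumes PADDLE_ENFORCE_GPU_SUCCESS is reachable via paddle/phi/core/enforce.h, as in the kernels above:

#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/core/enforce.h"

// Create and destroy a cuDNN tensor descriptor via the phi dynload wrappers.
void CreateAndDestroyTensorDesc() {
  cudnnTensorDescriptor_t desc;
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnCreateTensorDescriptor(&desc));
  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroyTensorDescriptor(desc));
}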
......@@ -63,17 +63,17 @@ void InstanceNormKernel(const Context &dev_ctx,
miopenTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&data_desc_));
phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
#else
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t in_param_desc_;
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
#endif
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
......@@ -89,27 +89,23 @@ void InstanceNormKernel(const Context &dev_ctx,
strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
const_cast<int *>(dims.data()),
const_cast<int *>(strides.data())));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, miopenBNSpatial));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
data_desc_,
CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4,
dims.data(),
strides.data()));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
#endif
const auto scale_ptr = scale.get_ptr();
......@@ -170,7 +166,7 @@ void InstanceNormKernel(const Context &dev_ctx,
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenBatchNormalizationForwardTraining(
phi::dynload::miopenBatchNormalizationForwardTraining(
handle,
miopenBNSpatial,
const_cast<void *>(
......@@ -194,12 +190,12 @@ void InstanceNormKernel(const Context &dev_ctx,
static_cast<void *>(saved_variance_data)));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(data_desc_));
phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
phi::dynload::cudnnBatchNormalizationForwardTraining(
handle,
CUDNN_BATCHNORM_SPATIAL,
CudnnDataType<T>::kOne(),
......@@ -219,9 +215,9 @@ void InstanceNormKernel(const Context &dev_ctx,
saved_variance_data));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
#endif
}
......
......@@ -89,11 +89,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......
......@@ -85,11 +85,10 @@ void GetClassInterval(const gpuStream_t& stream,
paddle::platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
static_cast<GPUContext*>(phi::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
num_classes_per_device_ptr,
num_classes_per_device_ptr,
num_classes_per_device.numel(),
......@@ -247,7 +246,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
// use global calculate stream
stream = static_cast<GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
phi::DeviceContextPool::Instance().Get(place))
->stream();
}
}
......@@ -358,7 +357,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
logits_max_buff,
logits_max_buff,
logits_max.numel(),
......@@ -400,7 +399,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
sum_exp_logits_buff,
sum_exp_logits_buff,
sum_exp_logits.numel(),
......@@ -459,7 +458,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx,
auto task = pg->AllReduce(in_tensor, out_tensor, opts);
task->Wait();
} else {
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
loss_ptr,
loss_ptr,
loss->numel(),
......
......@@ -161,5 +161,5 @@ PD_REGISTER_KERNEL(overlap_add_grad,
float,
double,
phi::dtype::float16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -147,5 +147,5 @@ PD_REGISTER_KERNEL(overlap_add,
float,
double,
phi::dtype::float16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {}
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -32,7 +32,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx,
const std::string& reduce,
DenseTensor* x_grad,
DenseTensor* value_grad) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet(
"PutAlongAxisGradOpCUDAKernel only runs on GPU."));
......
......@@ -31,7 +31,7 @@ void PutAlongAxisKernel(const Context& dev_ctx,
int axis,
const std::string& reduce,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet(
"PutAlongAxisCUDAKernel only runs on GPU device."));
......
......@@ -260,108 +260,104 @@ void RnnGradKernel(const Context &dev_ctx,
if (!has_seq_length) {
if (x_grad) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#else
// This interface is used when the input/output is unpadded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData(
handle,
rnn.rnn_desc(),
seq_length,
rnn.y_descs(),
out_data,
rnn.y_descs(),
out_grad_data,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_descs(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#endif
}
if (!weight_grad_list.empty()) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
rnn.weight_desc(),
weight_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
rnn.weight_desc(),
weight_grad_data,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
// permute weight grad list from weight grad tensor
TensorToPermutedWeight<T>(
place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec);
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_descs(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
#endif
}
} else {
......@@ -369,57 +365,55 @@ void RnnGradKernel(const Context &dev_ctx,
// for train
// This interface is used when the input/output is padded.
if (x_grad) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardDataEx(
handle,
rnn.rnn_desc(),
rnn.y_seq_desc(),
out_data,
rnn.y_seq_desc(),
out_grad_data,
nullptr,
nullptr,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_seq_desc(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx(
handle,
rnn.rnn_desc(),
rnn.y_seq_desc(),
out_data,
rnn.y_seq_desc(),
out_grad_data,
nullptr,
nullptr,
rnn.last_h_desc(),
last_h_grad_data,
rnn.last_c_desc(),
last_c_grad_data,
rnn.weight_desc(),
weight_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.x_seq_desc(),
x_grad_data,
rnn.init_h_desc(),
init_h_grad_data,
rnn.init_c_desc(),
init_c_grad_data,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
const_cast<uint8_t *>(reserve_data),
reserve_size));
}
if (!weight_grad_list.empty()) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNBackwardWeightsEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_seq_desc(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x.data<T>(),
rnn.init_h_desc(),
init_h_data,
rnn.y_seq_desc(),
out.data<T>(),
workspace_data_.data<uint8_t>(),
workspace_size,
rnn.weight_desc(),
weight_grad_data,
const_cast<uint8_t *>(reserve_data),
reserve_size));
}
#else
PADDLE_THROW(phi::errors::Unavailable(
......
......@@ -42,81 +42,78 @@ void RNNInferece(bool has_seq_length,
// This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNForwardInference(
handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
phi::dynload::miopenRNNForwardInference(handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardInference(
handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
phi::dynload::cudnnRNNForwardInference(handle,
rnn->rnn_desc(),
seq_length,
rnn->x_descs(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_descs(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
workspace_data->data<uint8_t>(),
workspace_size));
#endif
} else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for inference
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardInferenceEx(
handle,
rnn->rnn_desc(),
rnn->x_seq_desc(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_seq_desc(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data->data<uint8_t>(),
workspace_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx(
handle,
rnn->rnn_desc(),
rnn->x_seq_desc(),
x_data,
rnn->init_h_desc(),
init_h_data,
rnn->init_c_desc(),
init_c_data,
rnn->weight_desc(),
w_data,
rnn->y_seq_desc(),
out_data,
rnn->last_h_desc(),
last_h_data,
rnn->last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data->data<uint8_t>(),
workspace_size));
#else
// CUDNN VERSION has to >=7.2.1
PADDLE_THROW(phi::errors::Unavailable(
......@@ -310,88 +307,85 @@ void RnnKernel(const Context &dev_ctx,
// for train
// This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardTraining(
handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
phi::dynload::cudnnRNNForwardTraining(handle,
rnn.rnn_desc(),
seq_length,
rnn.x_descs(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_descs(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#endif
} else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
// for train
// This interface is used when the input/output is padded.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnRNNForwardTrainingEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_seq_desc(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx(
handle,
rnn.rnn_desc(),
rnn.x_seq_desc(),
x_data,
rnn.init_h_desc(),
init_h_data,
rnn.init_c_desc(),
init_c_data,
rnn.weight_desc(),
w_data,
rnn.y_seq_desc(),
out_data,
rnn.last_h_desc(),
last_h_data,
rnn.last_c_desc(),
last_c_data,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
nullptr,
workspace_data_.data<uint8_t>(),
workspace_size,
reserve_data,
reserve_size));
#else
PADDLE_THROW(phi::errors::Unavailable(
"The padded input is supported by "
......
......@@ -83,7 +83,7 @@ void SyncBatchNormKernel(const Context &ctx,
var_data = variance.template data<BatchNormParamType<T>>();
} else {
// x, x^2, 1, here 1 is used to calc device num
// device num also can be got from platform::DeviceContextPool
// device num also can be got from phi::DeviceContextPool
const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType<T>);
alloc_ptr = phi::memory_utils::Alloc(
ctx.GetPlace(),
......@@ -111,14 +111,14 @@ void SyncBatchNormKernel(const Context &ctx,
int dtype = paddle::platform::ToNCCLDataType(
paddle::framework::TransToProtoVarType(mean_out->dtype()));
// In-place operation
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce(
stats,
stats,
2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum,
comm,
stream));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::ncclAllReduce(stats,
stats,
2 * C + 1,
static_cast<ncclDataType_t>(dtype),
ncclSum,
comm,
stream));
VLOG(3) << "Sync result using all reduce";
}
#endif
......
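Every relocated call above stays wrapped in PADDLE_ENFORCE_GPU_SUCCESS; only the namespace of the wrapped symbol changes. As a minimal, self-contained illustration of that control flow (the real macro also decodes the device error string; the macro and the collective below are stand-ins, not Paddle's or NCCL's API):

#include <cstdio>
#include <cstdlib>

// Abort with a readable message when a status-returning call fails; this only
// mirrors the shape of PADDLE_ENFORCE_GPU_SUCCESS, not its error formatting.
#define DEMO_ENFORCE_SUCCESS(expr)                                          \
  do {                                                                      \
    const int status__ = (expr);                                            \
    if (status__ != 0) {                                                    \
      std::fprintf(stderr, "%s failed with status %d\n", #expr, status__);  \
      std::abort();                                                         \
    }                                                                       \
  } while (0)

// Pretend collective that returns 0 on success, standing in for a call such
// as phi::dynload::ncclAllReduce in the hunk above.
inline int fake_all_reduce(float* /*buf*/, int /*count*/) { return 0; }

int main() {
  float stats[3] = {1.f, 2.f, 3.f};
  DEMO_ENFORCE_SUCCESS(fake_all_reduce(stats, 3));
  std::puts("all-reduce wrapper returned success");
  return 0;
}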
......@@ -31,7 +31,7 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx,
int axis,
DenseTensor* x_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet("This kernel only runs on GPU."));
......
......@@ -29,7 +29,7 @@ void TakeAlongAxisKernel(const Context& dev_ctx,
int axis,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
errors::PreconditionNotMet("This kernel only runs on GPU device."));
......
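The TakeAlongAxis hunks above replace the paddle::platform::is_gpu_place helper with a direct comparison on the place's allocation type. A self-contained stand-in (not Paddle's actual types) showing that the two forms check the same thing:

#include <cassert>

namespace demo {
// Stand-in for phi::AllocationType.
enum class AllocationType { CPU, GPU, XPU };

// Stand-in for phi::Place: only the GetType() accessor matters here.
struct Place {
  AllocationType type;
  AllocationType GetType() const { return type; }
};

// Old-style free predicate, as paddle::platform::is_gpu_place was used.
inline bool is_gpu_place(const Place& p) {
  return p.GetType() == AllocationType::GPU;
}
}  // namespace demo

int main() {
  demo::Place gpu_place{demo::AllocationType::GPU};
  // New form used in the diff: compare the allocation type directly.
  assert(gpu_place.GetType() == demo::AllocationType::GPU);
  // Old form, kept only to show the check is unchanged in meaning.
  assert(demo::is_gpu_place(gpu_place));
  return 0;
}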
......@@ -97,7 +97,7 @@ void TriangularSolveKernel(const Context& dev_ctx,
memory_utils::Copy(dev_ctx.GetPlace(),
tmp_gpu_ptrs_data->ptr(),
paddle::platform::CPUPlace(),
phi::CPUPlace(),
static_cast<void*>(cpu_ptrs.data()),
cpu_ptrs.size() * sizeof(T*),
dev_ctx.stream());
......
......@@ -15,7 +15,7 @@
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_grad_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
......@@ -35,7 +35,7 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
bool align_corners,
DenseTensor* input_grad) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from "
......@@ -58,9 +58,8 @@ void AffineGridGradCudnnKernel(const Context& dev_ctx,
const T* output_grad_data = output_grad.data<T>();
T* theta_grad_data = dev_ctx.template Alloc<T>(theta_grad);
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfGridGeneratorBackward(
handle, cudnn_st_desc, output_grad_data, theta_grad_data));
}
} // namespace phi
......
......@@ -15,7 +15,6 @@
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/affine_grid_kernel.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
......@@ -35,7 +34,7 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
bool align_corners,
DenseTensor* output) {
PADDLE_ENFORCE_EQ(
paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU,
true,
phi::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from "
......@@ -56,12 +55,11 @@ void AffineGridCudnnKernel(const Context& dev_ctx,
cudnnSpatialTransformerDescriptor_t cudnn_st_desc =
st_desc.descriptor<T>(4, h_size_data);
PADDLE_ENFORCE_EQ(
paddle::platform::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data),
0,
phi::errors::Fatal("Some errors has occurred "
"during forward computation in cudnn."));
PADDLE_ENFORCE_EQ(phi::dynload::cudnnSpatialTfGridGeneratorForward(
handle, cudnn_st_desc, theta_data, output_data),
0,
phi::errors::Fatal("Some errors has occurred "
"during forward computation in cudnn."));
}
} // namespace phi
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/kernels/autotune/cache.h"
......
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_grad_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -161,7 +162,7 @@ void ConvCudnnGradKernelImplV7(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
......@@ -188,7 +189,7 @@ void ConvCudnnGradKernelImplV7(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -227,39 +228,38 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
bwd_result.algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionBackwardData(handle,
&alpha,
args1.odesc.desc(),
output_grad_data,
args1.wdesc.desc(),
filter_data,
args1.cdesc.desc(),
bwd_result.algo,
&beta,
args1.idesc.desc(),
temp_tensor_data,
cudnn_workspace_ptr,
workspace_size));
},
workspace_size);
PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenOpTensor(
handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenOpTensor(handle,
miopenTensorOpAdd,
&alpha,
args1.idesc.desc(),
transformed_input_grad_data,
&alpha,
args1.idesc.desc(),
temp_tensor_data,
&beta,
args1.idesc.desc(),
transformed_input_grad_data));
} else {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
phi::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args1.odesc.desc(),
......@@ -300,7 +300,7 @@ void ConvCudnnGradKernelImplV7(
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
phi::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args2.odesc.desc(),
......@@ -356,7 +356,7 @@ void ConvCudnnGradKernelImplV8(
PADDLE_ENFORCE_EQ(
groups,
1,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API is unsupported for now"));
cudnnHandle_t handle = const_cast<cudnnHandle_t>(ctx.cudnn_handle());
......@@ -1082,7 +1082,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1106,7 +1106,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1133,7 +1133,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1160,7 +1160,7 @@ void ConvCudnnGradGradKernel(
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
......@@ -1210,20 +1210,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_result1.algo,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args1.idesc.desc(),
ddx,
args1.wdesc.desc(),
w,
args1.cdesc.desc(),
fwd_result1.algo,
&beta,
args1.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -1248,20 +1247,19 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_result2.algo,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args2.idesc.desc(),
x,
args2.wdesc.desc(),
ddw,
args2.cdesc.desc(),
fwd_result2.algo,
&beta,
args2.odesc.desc(),
transformed_ddy_channel,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -1291,7 +1289,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardWeights(
phi::dynload::miopenConvolutionBackwardWeights(
handle,
&alpha,
args3.odesc.desc(),
......@@ -1330,7 +1328,7 @@ void ConvCudnnGradGradKernel(
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionBackwardData(
phi::dynload::miopenConvolutionBackwardData(
handle,
&alpha,
args4.odesc.desc(),
......
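In the conv kernels above, the recurring change is that the TF32 switch is now read through phi::AllowTF32Cudnn() before being handed to the convolution-descriptor setter. A hypothetical, self-contained sketch of that flag threading (the descriptor and flag below are stand-ins, not cuDNN's or Paddle's types):

#include <cstdio>

namespace demo {
enum class MathType { kTensorOpAllowTF32, kFMAOnly };

// Stand-in for the cdesc.set(...) calls in the diff: the boolean decides
// which math mode the convolution descriptor advertises.
struct ConvDesc {
  MathType math = MathType::kFMAOnly;
  void set(bool allow_tf32) {
    math = allow_tf32 ? MathType::kTensorOpAllowTF32 : MathType::kFMAOnly;
  }
};

// Stand-in for phi::AllowTF32Cudnn(), which reads a process-wide switch.
inline bool AllowTF32Cudnn() { return true; }
}  // namespace demo

int main() {
  demo::ConvDesc cdesc;
  cdesc.set(demo::AllowTF32Cudnn());
  std::printf("tf32 math selected: %s\n",
              cdesc.math == demo::MathType::kTensorOpAllowTF32 ? "yes" : "no");
  return 0;
}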
......@@ -14,6 +14,7 @@
#include "paddle/phi/kernels/conv_kernel.h"
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -79,18 +80,11 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
#ifdef PADDLE_WITH_HIP
// MIOPEN need to set groups in cdesc in miopen_desc.h
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn(),
groups);
args.cdesc.set(
dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups);
#else
args.cdesc.set(dtype,
padding_common,
strides,
dilations,
paddle::platform::AllowTF32Cudnn());
args.cdesc.set(
dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn());
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
......@@ -98,8 +92,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
// FIXME(typhoonzero): find a better way to disable groups
// rather than setting it to 1.
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnSetConvolutionGroupCount(
args.cdesc.desc(), groups));
phi::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups));
groups = 1;
#endif
#ifdef PADDLE_WITH_HIP
......@@ -185,20 +178,19 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input,
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::miopenConvolutionForward(
handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
fwd_result.algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
phi::dynload::miopenConvolutionForward(handle,
&alpha,
args.idesc.desc(),
input_data,
args.wdesc.desc(),
filter_data,
args.cdesc.desc(),
fwd_result.algo,
&beta,
args.odesc.desc(),
output_data,
workspace_ptr,
workspace_size));
},
workspace_size);
#else
......@@ -237,7 +229,7 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
PADDLE_ENFORCE_EQ(
groups,
1,
paddle::platform::errors::Unimplemented(
phi::errors::Unimplemented(
"Group concolution using CUDNNv8 API unsupported for now"));
T* input_data = const_cast<T*>(input_tensor->data<T>());
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
......@@ -219,7 +220,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
......@@ -244,7 +245,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -691,7 +692,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search1 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
......@@ -713,7 +714,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search2 = SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
......@@ -738,7 +739,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search3 = SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
......@@ -764,7 +765,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_group);
#ifdef PADDLE_WITH_HIP
using search4 = SearchAlgorithm<miopenConvFwdAlgorithm_t>;
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <algorithm>
#include "paddle/phi/backends/context_pool.h"
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
......@@ -216,7 +217,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
padding_common,
strides,
dilations_,
paddle::platform::AllowTF32Cudnn(),
phi::AllowTF32Cudnn(),
c_groups);
#ifdef PADDLE_WITH_HIP
......
This diff has been collapsed.