Unverified commit c48a9ad5, authored by W Wilber, committed by GitHub

[Pten] Replace platform::Place with pten::Place. (#38899)

* add pten::Place data structure.

* update ci problem

* fix ci problem

* update

* using platform::Place = pten::Place

* remove BOOST_GET_CONST for CPUPlace and GPUPlace

* compile pass 25%.

* compile pass 45%

* compile pass 60%

* remove boost_get for xpu npu mlu and ipu

* compile pass on cpu and gpu.

* fix compile problem

* fix compile error.

* update

* fix ci problem

* update

* ci approve

* fix ci problem

* fix ci eager test problem

* remove BOOST_GET_CONST

* fix npu compile
Parent commit 1dbc8632
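
For readers of this diff: the essence of the change is that platform::Place stops being a boost::variant over per-device place structs (which forced call sites to write BOOST_GET_CONST(platform::CUDAPlace, place).device) and becomes an alias of the new pten::Place, a plain class carrying an allocation type plus a device id. The self-contained sketch below only illustrates that shape; the enum values, member names, and the is_gpu_place helper are simplified assumptions modeled on the call sites visible in the hunks (place.GetType(), place.GetDeviceId(), place.device), not the real pten::Place implementation.

    #include <cstdint>
    #include <iostream>

    // Simplified stand-in for pten::Place; names are assumptions for illustration only.
    enum class AllocationType : int8_t { UNDEFINED = 0, CPU, GPU, XPU, NPU, MLU, IPU };

    class Place {
     public:
      Place() = default;
      explicit Place(AllocationType type, int8_t device_id = 0)
          : device(device_id), alloc_type_(type) {}

      AllocationType GetType() const { return alloc_type_; }
      int8_t GetDeviceId() const { return device; }

      int8_t device{0};  // public, so call sites can read `place.device` directly

     private:
      AllocationType alloc_type_{AllocationType::UNDEFINED};
    };

    inline bool is_gpu_place(const Place& p) {
      return p.GetType() == AllocationType::GPU;
    }

    int main() {
      Place place(AllocationType::GPU, /*device_id=*/3);

      // Before this PR (conceptually):
      //   int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      // After this PR, such call sites collapse to:
      int dev_id = place.device;

      std::cout << "is_gpu_place: " << is_gpu_place(place)
                << ", dev_id: " << dev_id << std::endl;
      return 0;
    }

With a single concrete Place type, the per-device BOOST_GET_CONST extractions throughout the diff below become plain member accesses, which is what most of the hunks consist of.
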
......@@ -221,8 +221,8 @@ static std::shared_ptr<framework::GarbageCollector> GetGC(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
if (framework::IsFastEagerDeletionModeEnabled()) {
gc.reset(new framework::UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new framework::UnsafeFastGPUGarbageCollector(place,
max_memory_size));
}
}
#endif
......
......@@ -106,13 +106,12 @@ void SerializeLodTensor(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
......@@ -148,13 +147,12 @@ void SerializeSelectedRows(framework::Variable* var,
iobuf->append(reinterpret_cast<const char*>(tensor->data()), data_len);
} else {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
auto data_len = tensor->numel() * framework::SizeOfType(tensor->type());
iobuf->append(reinterpret_cast<const char*>(&data_len), 8);
......@@ -204,7 +202,7 @@ void DeserializeFromMultiVarMsgAndIOBuf(const MultiVarMsg& multi_msg,
}
void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
butil::IOBufBytesIterator& io_buffer_itr, // NOLINT
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
......@@ -229,30 +227,30 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg,
// IO Buffer
if (platform::is_cpu_place(place)) {
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
unsigned long data_len;
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len);
unsigned long data_len; // NOLINT
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), (void*)temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
memory::Copy(
place, tensor_data, platform::CPUPlace(), (void*)temp_ptr, // NOLINT
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
delete[] temp_ptr;
#endif
}
}
void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr,
const platform::DeviceContext& ctx) {
void DeserializeSelectedRows(
framework::Variable* var, const VarMsg& msg,
butil::IOBufBytesIterator& io_buffer_itr, // NOLINT
const platform::DeviceContext& ctx) {
const auto place = ctx.GetPlace();
auto* slr = var->GetMutable<framework::SelectedRows>();
framework::Tensor* tensor = slr->mutable_value();
......@@ -269,20 +267,19 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg,
tensor->mutable_data(place, VarMessageToVarType(msg.data_type()));
// IO Buffer
if (platform::is_cpu_place(place)) {
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(tensor_data, data_len);
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
char* temp_ptr =
new char[tensor->numel() * framework::SizeOfType(tensor->type())];
unsigned long data_len;
io_buffer_itr.copy_and_forward((void*)(&data_len), 8);
char* temp_ptr = new char[tensor->numel() *
framework::SizeOfType(tensor->type())]; // NOLINT
unsigned long data_len; // NOLINT
io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT
io_buffer_itr.copy_and_forward(temp_ptr, data_len);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), temp_ptr,
memory::Copy(place, tensor_data, platform::CPUPlace(), temp_ptr,
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
delete[] temp_ptr;
......
......@@ -44,8 +44,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(
platform::CPUPlace(), temp_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(),
platform::CPUPlace(), temp_ptr, tensor->place(), tensor->data(),
tensor->numel() * framework::SizeOfType(tensor->type()), stream);
float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
micro_id = static_cast<int>(temp_ptr_float[0]);
......
......@@ -43,7 +43,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const paddle::platform::CPUPlace& place) {
void operator()(const paddle::platform::CPUPlace& place) const {
paddle::platform::CPUDeviceContext* ctx =
dynamic_cast<paddle::platform::CPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
......@@ -56,7 +56,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
// TODO(jiabin): Support xpu here from gradient_accumulator.cc
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const paddle::platform::CUDAPlace& place) {
void operator()(const paddle::platform::CUDAPlace& place) const {
paddle::platform::CUDADeviceContext* ctx =
dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
......@@ -66,7 +66,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const paddle::platform::CUDAPlace& place) {
void operator()(const paddle::platform::CUDAPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -76,7 +76,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
// TODO(jiabin): Support Npu here from gradient_accumulator.cc
// there is NO blas in CUDAPinnedPlace
void operator()(const paddle::platform::CUDAPinnedPlace& place) {
void operator()(const paddle::platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -84,14 +84,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const paddle::platform::NPUPlace& place) {
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::NPUPlace& place) {
void operator()(const paddle::platform::NPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -100,14 +100,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_XPU
void operator()(const paddle::platform::XPUPlace& place) {
void operator()(const paddle::platform::XPUPlace& place) const {
paddle::platform::XPUDeviceContext* ctx =
dynamic_cast<paddle::platform::XPUDeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place));
xpu::add<T>(ctx->x_context(), x_, y_, y_, static_cast<int>(numel_));
}
#else
void operator()(const paddle::platform::XPUPlace& place) {
void operator()(const paddle::platform::XPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -116,14 +116,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const paddle::platform::MLUPlace& place) {
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::MLUPlace& place) {
void operator()(const paddle::platform::MLUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -132,14 +132,14 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_IPU
void operator()(const paddle::platform::IPUPlace& place) {
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#else
void operator()(const paddle::platform::IPUPlace& place) {
void operator()(const paddle::platform::IPUPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -147,7 +147,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
void operator()(const paddle::platform::NPUPinnedPlace& place) {
void operator()(const paddle::platform::NPUPinnedPlace& place) const {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -157,7 +157,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private:
int64_t numel_;
const T* x_;
T* y_;
mutable T* y_;
};
template <typename DeviceContext, typename T>
......@@ -218,7 +218,7 @@ void TensorAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
if (data_type == paddle::framework::DataTypeTrait<cpp_type>::DataType()) { \
TensorAddFunctor<cpp_type> func(numel, src_tensor->data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>()); \
boost::apply_visitor(func, place); \
paddle::platform::VisitPlace(place, func); \
return; \
}
......@@ -294,7 +294,7 @@ void VariableAdd(const egr::EagerTensor& src, egr::EagerTensor* dst) {
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
paddle::platform::VisitPlace(place, func); \
return; \
}
......
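
The gradient_accumulator hunks above swap boost::apply_visitor(func, place) for paddle::platform::VisitPlace(place, func), which is why every operator()(const ...Place&) overload gains a const qualifier and y_ becomes mutable: the functor is now passed by const reference into a switch-style dispatcher instead of a boost variant visitor. The sketch below shows, under simplified assumed types, how such a dispatcher can switch on Place::GetType() and forward to the matching overload; it is an illustration, not the actual VisitPlace implementation.

    #include <cstdio>
    #include <stdexcept>

    // Minimal stand-in types; names are simplified assumptions, not the real
    // paddle/fluid/platform definitions.
    enum class AllocationType { CPU, GPU };
    struct CPUPlace {};
    struct CUDAPlace { int device = 0; };

    struct Place {
      AllocationType type = AllocationType::CPU;
      int device = 0;
      AllocationType GetType() const { return type; }
    };

    // Dispatch on the runtime place type and call the matching const operator()
    // overload. Because the visitor is taken by const reference, the functor's
    // overloads must be const and any written-through member (like y_) mutable.
    template <typename Visitor>
    void VisitPlace(const Place& place, const Visitor& visitor) {
      switch (place.GetType()) {
        case AllocationType::CPU: {
          visitor(CPUPlace{});
          return;
        }
        case AllocationType::GPU: {
          CUDAPlace gpu;
          gpu.device = place.device;
          visitor(gpu);
          return;
        }
      }
      throw std::runtime_error("unsupported place type in this sketch");
    }

    struct PrintVisitor {
      void operator()(const CPUPlace&) const { std::puts("cpu"); }
      void operator()(const CUDAPlace& p) const { std::printf("gpu:%d\n", p.device); }
    };

    int main() {
      Place p;
      p.type = AllocationType::GPU;
      p.device = 1;
      VisitPlace(p, PrintVisitor{});  // prints "gpu:1"
      return 0;
    }
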
......@@ -150,24 +150,21 @@ void RunOp(const std::string& type, const NameTensorMap& ins,
VLOG(6) << "Get Device id";
if (paddle::platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::SetDeviceId(
BOOST_GET_CONST(paddle::platform::CUDAPlace, place).device);
paddle::platform::SetDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (paddle::platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
paddle::platform::SetXPUDeviceId(
BOOST_GET_CONST(paddle::platform::XPUPlace, place).device);
paddle::platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::SetNPUDeviceId(
BOOST_GET_CONST(paddle::platform::NPUPlace, place).device);
paddle::platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
......
......@@ -116,7 +116,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
......@@ -129,7 +129,7 @@ PreparedOp PrepareImpl(const NameTensorMap& ins, const NameTensorMap& outs,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -22,7 +22,7 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
<< " dst_place: " << dst_place;
PADDLE_ENFORCE_NE(
in.place().which(), dst_place.which(),
in.place().GetType(), dst_place.GetType(),
platform::errors::Unavailable("Currently, model parallelism is only "
"supported between CPU and CUDA."));
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......@@ -181,7 +182,7 @@ void AllReduceOpHandle::AllReduceFunc(
const framework::proto::VarType::Type &dtype, int64_t numel,
const std::vector<platform::Place> &places,
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
if (platform::is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
......@@ -200,7 +201,7 @@ void AllReduceOpHandle::AllReduceFunc(
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with GPU."));
#endif
} else if (is_xpu_place(places[0])) {
} else if (platform::is_xpu_place(places[0])) {
#if defined(PADDLE_WITH_XPU_BKCL)
PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
platform::errors::InvalidArgument(
......@@ -286,7 +287,7 @@ void AllReduceOpHandle::NCCLAllReduceFunc(
void AllReduceOpHandle::SyncNCCLAllReduce() {
if (FLAGS_sync_nccl_allreduce) {
for (auto &p : places_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto *nccl_ctxs =
nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
......
......@@ -46,7 +46,7 @@ BindThreadedSSAGraphExecutor::BindThreadedSSAGraphExecutor(
}
int index = 0;
for (uint32_t i = 0; i < places.size(); i++) {
int id = BOOST_GET_CONST(platform::XPUPlace, places_[i]).device;
int id = places_[i].device;
if (place_to_index_.find(id) == place_to_index_.end()) {
place_to_index_[id] = index;
index++;
......@@ -145,8 +145,7 @@ FetchResultType BindThreadedSSAGraphExecutor::RunMainStream(
RunMultiDeviceOpAsync(cur_op, op_deps.get(), ready_ops);
continue;
} else {
cur_place =
BOOST_GET_CONST(platform::XPUPlace, dev_ctxes_.begin()->first);
cur_place = dev_ctxes_.begin()->first;
int cur_index = place_to_index_[cur_place.device];
RunOpAsyncMainStream(cur_op, op_deps.get(), ready_ops, cur_index);
}
......
......@@ -85,7 +85,7 @@ class BKCLOpHandleBase : public OpHandleBase {
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
int dev_id = place.device;
auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
auto comm = bkcl_ctx.comm_;
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -83,8 +84,7 @@ void BroadcastOpHandle::BroadcastOneVar(
} else if (platform::is_gpu_place(in_tensor.place())) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
VarHandle *out_handle = nullptr;
int root_id =
BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
int root_id = in_tensor.place().device;
std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToNCCLDataType(in_tensor.type());
......@@ -94,8 +94,7 @@ void BroadcastOpHandle::BroadcastOneVar(
Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
int dst_id =
BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place()).device;
int dst_id = out_var_handle->place().device;
auto &nccl_ctx = nccl_ctxs_->at(dst_id);
......@@ -145,7 +144,7 @@ void BroadcastOpHandle::BroadcastOneVar(
} else {
#if defined(PADDLE_WITH_XPU_BKCL)
VarHandle *out_handle = nullptr;
int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
int root_id = in_tensor.place().device;
std::vector<std::function<void()>> broadcast_calls;
int type = platform::ToBKCLDataType(in_tensor.type());
......@@ -155,8 +154,7 @@ void BroadcastOpHandle::BroadcastOneVar(
Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name());
int dst_id =
BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
int dst_id = out_var_handle->place().device;
auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
......@@ -232,7 +230,7 @@ void BroadcastOpHandle::InitOutputValue(
PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) {
if (platform::is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
......
......@@ -46,8 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
platform::CUDADeviceGuard guard(place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
......@@ -72,7 +71,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
auto gpu_place = dev_ctx_->GetPlace();
platform::CUDADeviceGuard guard(gpu_place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_));
......@@ -85,8 +84,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
void EagerDeletionOpHandle::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
int dev_id = dev_ctxes_.begin()->first.device;
events_[dev_id] = nullptr;
#endif
}
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/device_memory_aligment.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_bool(skip_fused_all_reduce_check, false, "");
......@@ -102,7 +103,7 @@ void FusedAllReduceOpHandle::RunImpl() {
gpuStream_t compute_stream{nullptr};
if (FLAGS_allreduce_record_one_event) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]);
auto gpu_place = platform::CUDAPlace(places_[0].GetDeviceId());
compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream();
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
......@@ -291,7 +292,7 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
if (!platform::is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true;
}
}
......
......@@ -354,7 +354,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
float* cpu_data = new float[tensor->numel()];
memory::Copy(platform::CPUPlace(), static_cast<void*>(cpu_data),
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
tensor->place(),
static_cast<const void*>(tensor->data<float>()),
tensor->numel() * sizeof(float));
bool flag = false;
......
......@@ -132,7 +132,7 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(tensor_.place()));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, tensor_.place()).device;
int dev_id = tensor_.place().device;
PADDLE_ENFORCE_EQ(
(dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true,
platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d",
......
......@@ -102,7 +102,7 @@ class NCCLOpHandleBase : public OpHandleBase {
}
for (auto& p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
int dev_id = p.first.device;
if (inter_events_.find(dev_id) != inter_events_.end()) {
continue;
}
......@@ -133,7 +133,7 @@ class NCCLOpHandleBase : public OpHandleBase {
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -181,7 +181,7 @@ class NCCLOpHandleBase : public OpHandleBase {
void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff,
size_t count, ncclDataType_t datatype, ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -213,7 +213,7 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......@@ -246,7 +246,7 @@ class NCCLOpHandleBase : public OpHandleBase {
void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
auto comm = nccl_ctx.comm_;
......
......@@ -47,7 +47,7 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
void OpHandleBase::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
int dev_id = p.first.device;
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -61,9 +61,7 @@ void OpHandleBase::InitCUDA() {
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, out_var_handle->place())
.device;
int dev_id = out_var_handle->place().device;
out_var_handle->SetGenerateEvent(events_.at(dev_id));
}
}
......@@ -74,7 +72,7 @@ void OpHandleBase::InitCUDA() {
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) {
......@@ -109,7 +107,7 @@ void OpHandleBase::InitXPU() {
platform::errors::InvalidArgument(
"%s should have only one dev_ctx.", Name()));
auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
int dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
......@@ -309,7 +307,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!events_.empty()) { // Use event
for (auto &p : dev_ctxes_) {
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
auto dev_id = p.first.device;
auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
#ifdef PADDLE_WITH_HIP
......@@ -332,8 +330,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
} else {
auto *ctx = dev_ctxes_.at(p);
auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
cuda_ctx->RecordEvent(
events_.at(BOOST_GET_CONST(platform::CUDAPlace, p).device), callback);
cuda_ctx->RecordEvent(events_.at(p.device), callback);
}
#else
callback();
......
......@@ -45,7 +45,7 @@ static std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
for (auto &op : op_handles) {
auto &dev_ctx = op->DeviceContext();
auto &p = dev_ctx.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
......
......@@ -17,6 +17,7 @@
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
PADDLE_DEFINE_EXPORTED_bool(
......@@ -125,7 +126,8 @@ void ReduceOpHandle::RunImpl() {
// TODO(gongwb): add cpu support
if (collective_context.endpoints_.size() <= 1 ||
is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) {
platform::is_cpu_place(in_places[0]) ||
platform::is_cpu_place(t_out_p)) {
GatherLocalSelectedRowsFunctor functor(
in_selected_rows, in_places, dev_ctxes_, t_out_p,
out_var->GetMutable<framework::SelectedRows>());
......@@ -172,13 +174,13 @@ void ReduceOpHandle::RunImpl() {
out_var_handle->place(), pre_in.type());
auto out_p = out_var_handle->place();
int root_id = BOOST_GET_CONST(platform::CUDAPlace, out_p).device;
int root_id = out_p.device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p).device;
int dev_id = p.device;
auto &nccl_ctx = nccl_ctxs_->at(dev_id);
void *buffer = const_cast<void *>(lod_tensor.data());
......@@ -218,13 +220,13 @@ void ReduceOpHandle::RunImpl() {
out_var_handle->place(), pre_in.type());
auto out_p = out_var_handle->place();
int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
int root_id = out_p.device;
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < var_scopes.size(); ++i) {
auto &p = in_places[i];
auto &lod_tensor = *lod_tensors[i];
int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
int dev_id = p.device;
auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
void *buffer = const_cast<void *>(lod_tensor.data());
......
......@@ -61,8 +61,8 @@ struct ScaleLossGradFunctor {
} else if (platform::is_xpu_place(place_)) {
#if defined(PADDLE_WITH_XPU)
OutT cast_coeff = static_cast<OutT>(coeff_);
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_));
memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
SizeOfType(out_dtype_));
VLOG(10) << place_ << "RUN Scale loss grad op";
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -73,9 +73,8 @@ struct ScaleLossGradFunctor {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_),
stream);
memory::Copy(place_, out_data, platform::CPUPlace(), &cast_coeff,
SizeOfType(out_dtype_), stream);
VLOG(10) << place_ << "RUN Scale loss grad op";
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......
......@@ -86,8 +86,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype(
void ShareTensorBufferOpHandle::InitCUDA() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
int dev_id = dev_ctxes_.begin()->first.device;
events_[dev_id] = nullptr;
#endif
}
......
......@@ -165,7 +165,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
int dev_id = place.device;
auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
auto &nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream();
......
......@@ -106,9 +106,12 @@ struct EnforceShapeAndDTypeEQVisitor {
void operator()(const LoDTensor& src) {
auto& tensor = dst_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.place().GetType(), tensor.place().GetType(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal. The src place "
"is %s, but the dst place is %s",
src.place().DebugString(), tensor.place().DebugString()));
PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
......@@ -127,9 +130,12 @@ struct EnforceShapeAndDTypeEQVisitor {
void operator()(const SelectedRows& src) {
auto& selected_rows = dst_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.place().GetType(), selected_rows.place().GetType(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal. The src place "
"is %s, but the dst place is %s",
src.place().DebugString(), selected_rows.place().DebugString()));
PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
......
......@@ -138,7 +138,7 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) {
// init device, DLDevice type with device_type and device_id
auto place = tensor.place();
t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place);
t_.device = paddle::platform::VisitPlace(place, internal::DLDeviceVisitor());
// init dtype
t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type());
......
......@@ -63,8 +63,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) {
CHECK_EQ(0, dl_tensor.device.device_id);
} else if (platform::is_gpu_place(place)) {
CHECK_EQ(kDLGPU, dl_tensor.device.device_type);
CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device,
dl_tensor.device.device_id);
CHECK_EQ(place.device, dl_tensor.device.device_id);
} else if (platform::is_cuda_pinned_place(place)) {
CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type);
CHECK_EQ(0, dl_tensor.device.device_id);
......
......@@ -72,7 +72,7 @@ Executor::~Executor() {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this);
platform::ClearMKLDNNCache(place_, this);
#endif
}
......@@ -443,31 +443,26 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
if (platform::is_gpu_place(place_)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new DefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
#endif
} else if (platform::is_cpu_place(place_)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
gc.reset(new CPUGarbageCollector(place_, max_memory_size));
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
gc.reset(new XPUGarbageCollector(place_, max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
gc.reset(new IPUGarbageCollector(place_, max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
......@@ -476,16 +471,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#ifdef PADDLE_WITH_ASCEND_CL
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
......@@ -494,11 +487,9 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place_), max_memory_size));
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
......
......@@ -137,8 +137,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
const int expand_embed_dim,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
......@@ -203,8 +202,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t* total_keys,
const int64_t* gpu_len, int slot_num, int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
......@@ -225,8 +223,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
const int hidden_size, const int expand_embed_dim,
const int64_t total_length, const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
......@@ -45,7 +45,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
} else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
LoDTensor& total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
......@@ -131,7 +131,7 @@ void BoxWrapper::PushSparseGradCase(
"Warning:: CPUPlace is not supported in PaddleBox now."));
} else if (platform::is_gpu_place(place)) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
......@@ -143,8 +143,7 @@ void BoxWrapper::PushSparseGradCase(
push_boxps_timer.Start();
int ret = boxps_ptr_->PushSparseGPU(
total_keys, reinterpret_cast<void*>(total_grad_values_gpu),
static_cast<int>(total_length),
BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId());
static_cast<int>(total_length), place.GetDeviceId());
PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
"PushSparseGPU failed in BoxPS."));
push_boxps_timer.Pause();
......
......@@ -764,8 +764,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace());
memory::Copy(platform::CUDAPinnedPlace(), pin_g,
BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
memory::Copy(platform::CUDAPinnedPlace(), pin_g, place, g_data,
sizeof(float) * count, stream);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream));
......@@ -821,8 +820,7 @@ void FleetWrapper::PushDenseVarsAsync(
LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
float* pin_g =
pin_tensor->mutable_data<float>(tensor->dims(), platform::CPUPlace());
memory::Copy(platform::CPUPlace(), pin_g,
BOOST_GET_CONST(platform::XPUPlace, place), g_data,
memory::Copy(platform::CPUPlace(), pin_g, place, g_data,
sizeof(float) * count);
float* g = pin_g;
......
......@@ -116,14 +116,12 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
tensor->numel() * SizeOfType(tensor->type()));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
tensor->data(), tensor->numel() * SizeOfType(tensor->type()),
nullptr);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(platform::CPUPlace(), data_ptr,
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(),
tensor->data(), tensor->numel() * SizeOfType(tensor->type()));
#endif
}
......@@ -158,8 +156,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()), stream);
#else
memcpy(tensor_data, req_var.data().data(),
......@@ -197,8 +194,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
tensor->mutable_data(place, ToVarType(req_var.data_type()));
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data,
platform::CPUPlace(), req_var.data().data(),
memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(),
tensor->numel() * SizeOfType(tensor->type()));
#else
memcpy(tensor_data, req_var.data().data(),
......
......@@ -791,7 +791,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
"Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) {
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
......@@ -859,7 +859,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now."));
} else if (platform::is_gpu_place(place)) {
int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys =
......
......@@ -113,8 +113,7 @@ void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
const int hidden_size,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
......@@ -132,8 +131,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
const int64_t* gpu_len, int slot_num,
int total_len) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
origin_keys, total_keys, gpu_len, slot_num, total_len);
......@@ -148,8 +146,7 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const int64_t total_length,
const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(
BOOST_GET_CONST(platform::CUDAPlace, place)))
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
......
......@@ -101,7 +101,7 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
}
StreamGarbageCollector::~StreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace());
auto place = this->dev_ctx_->GetPlace();
platform::CUDADeviceGuard guard(place.device);
platform::GpuStreamSync(stream_);
platform::GpuDestroyStream(stream_);
......@@ -186,7 +186,7 @@ MLUStreamGarbageCollector::MLUStreamGarbageCollector(
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::MLUPlace, this->dev_ctx_->GetPlace());
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
......
......@@ -46,8 +46,8 @@ void SetMicroId(paddle::framework::Scope* scope,
temp_ptr_float[0] = micro_id;
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
platform::CPUPlace(), reinterpret_cast<void*>(temp_ptr),
memory::Copy(place, tensor_data, platform::CPUPlace(),
reinterpret_cast<void*>(temp_ptr),
tensor->numel() * framework::SizeOfType(tensor->type()),
stream);
#endif
......
......@@ -117,12 +117,12 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
#ifdef PADDLE_WITH_CUDA
auto stream = copy_streams_[num];
auto event = events_[num];
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
#endif
#ifdef PADDLE_WITH_XPU
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
#endif
......@@ -173,13 +173,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel(), stream);
} else {
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel(), stream);
memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
sizeof(T) * root_tensor->numel(), stream);
}
}
#endif
......@@ -193,13 +191,11 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
if (platform::is_cpu_place(root_tensor->place())) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
platform::CPUPlace(), root_ptr,
memory::Copy(thread_place, thread_ptr, platform::CPUPlace(), root_ptr,
sizeof(T) * root_tensor->numel());
} else {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr,
BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()),
root_ptr, sizeof(T) * root_tensor->numel());
memory::Copy(thread_place, thread_ptr, root_tensor->place(), root_ptr,
sizeof(T) * root_tensor->numel());
}
}
#endif
......@@ -286,7 +282,7 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
......@@ -336,15 +332,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
_ForEachDataType_(MergeCallback);
if (!platform::is_cpu_place(thread_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device;
auto dev_id = thread_tensor->place().device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(thread_tensor->data(), 0,
thread_tensor->numel() * SizeOfType(thread_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = thread_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -364,15 +359,14 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request,
merge_var);
if (!platform::is_cpu_place(root_tensor->place())) {
#ifdef PADDLE_WITH_CUDA
auto dev_id =
BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device;
auto dev_id = root_tensor->place().device;
platform::CUDADeviceGuard guard(dev_id);
cudaMemset(root_tensor->data(), 0,
root_tensor->numel() * SizeOfType(root_tensor->type()));
#endif
#ifdef PADDLE_WITH_XPU
auto place = root_tensor->place();
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::XPUDeviceGuard guard(dev_id);
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -442,7 +436,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
(context->ops_).push_back(local_op_ptr);
}
#ifdef PADDLE_WITH_CUDA
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::CUDADeviceGuard guard(dev_id);
PADDLE_ENFORCE_GPU_SUCCESS(
cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
......
......@@ -67,7 +67,7 @@ Graph *Pass::Apply(Graph *graph) const {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// Passes can change params, tensors, so caching need to be discarded
ClearMKLDNNCache(paddle::platform::CPUPlace());
platform::ClearMKLDNNCache(paddle::platform::CPUPlace());
#endif
VLOG(10) << "finish to apply pass " << Type() << " to graph";
return graph;
......
......@@ -32,10 +32,8 @@ namespace framework {
inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
const paddle::memory::allocation::AllocationPtr &gpu_) {
return gpu_ == nullptr
? paddle::none
: paddle::optional<platform::CUDAPlace>(
BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
return gpu_ == nullptr ? paddle::none
: paddle::optional<platform::CUDAPlace>(gpu_->place());
}
// Vector<T> implements the std::vector interface, and can get Data or
......@@ -369,11 +367,11 @@ class Vector {
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none ||
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
if (cuda_place == paddle::none || cuda_place == p) {
return m_.Data().CUDAData(place);
}
}
......@@ -385,11 +383,11 @@ class Vector {
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
{
platform::CUDAPlace p(place.GetDeviceId());
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == paddle::none ||
cuda_place == BOOST_GET(platform::CUDAPlace, place)) {
if (cuda_place == paddle::none || cuda_place == p) {
return m_.MutableData()->CUDAMutableData(place);
}
}
......
......@@ -131,7 +131,7 @@ NaiveExecutor::~NaiveExecutor() {
#ifdef PADDLE_WITH_MKLDNN
// Clear mkl-dnn cache,
// this is needed to have mkl-dnn unit tests working
ClearMKLDNNCache(place_, this);
platform::ClearMKLDNNCache(place_, this);
#endif
}
......
......@@ -43,7 +43,7 @@ class ProfilerGuard {
void TotalCUDAAllocatedMemorySize(const platform::Place& place) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, place);
auto cuda_place = place;
cost_info_->device_memory_bytes =
platform::RecordedGpuMallocSize(cuda_place.device);
#endif
......
......@@ -22,7 +22,7 @@ namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
int cur_loc = 0;
int place = key.place_.which();
int place = static_cast<int>(key.place_.GetType());
cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc;
......
......@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
"data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
"data_type[float]:data_layout[NCHW]:place[Place(cpu)]:library_type["
"CUDNN]");
using CUDAPlace = paddle::platform::CUDAPlace;
......@@ -35,7 +35,7 @@ TEST(OpKernelType, ToString) {
LibraryType::kCUDNN);
ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
"data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
"CUDAPlace(0)]:library_"
"Place(gpu:0)]:library_"
"type[CUDNN]");
}
......
......@@ -210,7 +210,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with CUDA support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto dev_id = place.device;
platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
......@@ -220,7 +220,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with XPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_npu_place(place)) {
......@@ -230,7 +230,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with NPU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
auto dev_id = place.device;
platform::SetNPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
......@@ -240,7 +240,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = BOOST_GET_CONST(platform::MLUPlace, place).device;
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
}
......@@ -1330,7 +1330,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(type_, expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(type_))) {
......@@ -1343,7 +1343,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......@@ -1353,7 +1353,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) {
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -500,11 +500,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size));
} else {
gc.reset(new StreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
gc.reset(new StreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
......@@ -515,11 +513,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), max_memory_size));
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
......@@ -529,8 +525,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
gc.reset(new XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size));
gc.reset(new XPUGarbageCollector(place, max_memory_size));
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -538,8 +533,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size));
gc.reset(new CPUGarbageCollector(place, max_memory_size));
VLOG(10) << "Created GarbageCollector at " << place;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......@@ -609,10 +603,9 @@ void InitP2P(const std::vector<platform::Place> &places) {
std::vector<int> devices;
for (int i = 0; i < count; i++) {
if (!is_gpu_place(places[i])) return;
if (!platform::is_gpu_place(places[i])) return;
platform::CUDAPlace device =
BOOST_GET_CONST(platform::CUDAPlace, places[i]);
platform::CUDAPlace device = places[i];
devices.push_back(device.GetDeviceId());
}
......@@ -655,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy,
ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]),
platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor"));
PADDLE_ENFORCE_EQ(places.size() > 0 && !platform::is_npu_place(places[0]),
true, platform::errors::Unavailable(
"NPU is not supported in ParallelExecutor."));
InitP2P(places);
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size());
......
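The garbage-collector constructors used by ParallelExecutor (and, further down, by SectionWorker and Tracer) now take the generic place directly instead of a BOOST_GET_CONST-extracted CUDAPlace/XPUPlace/CPUPlace. Below is a sketch of a constructor widened this way, with a mock GarbageCollector hierarchy standing in for Paddle's classes.

```cpp
#include <cstddef>
#include <iostream>
#include <memory>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
};

// Mock base class: one constructor signature that accepts the generic place,
// so call sites can forward `place` for any backend.
class GarbageCollector {
 public:
  GarbageCollector(const Place& place, std::size_t max_memory_size)
      : place_(place), max_memory_size_(max_memory_size) {}
  virtual ~GarbageCollector() = default;

 protected:
  Place place_;
  std::size_t max_memory_size_;
};

class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
  using GarbageCollector::GarbageCollector;  // inherit the Place-based ctor
};

int main() {
  Place gpu0{AllocationType::GPU, 0};
  std::unique_ptr<GarbageCollector> gc;
  // Same call shape as the diff:
  //   gc.reset(new UnsafeFastGPUGarbageCollector(place, max_memory_size));
  gc.reset(new UnsafeFastGPUGarbageCollector(gpu0, /*max_memory_size=*/0));
  std::cout << "created gc for gpu:0\n";
}
```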
......@@ -135,13 +135,11 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w,
memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w,
platform::CPUPlace(), pin_w,
memory::Copy(places_[i], w, platform::CPUPlace(), pin_w,
sizeof(float) * tensor->numel());
#endif
}
......
......@@ -224,23 +224,20 @@ void SectionWorker::TrainFiles() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
gc.reset(new UnsafeFastGPUGarbageCollector(place_, max_memory_size));
}
}
#elif defined(PADDLE_WITH_ASCEND_CL)
if (IsFastEagerDeletionModeEnabled()) {
VLOG(4) << "Use unsafe fast gc for NPU.";
gc.reset(new NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Please set FLAGS_fast_eager_deletion_mode=true to use "
"GarbageCollector on NPU."));
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
VLOG(4) << "Use default stream gc for NPU.";
gc.reset(new NPUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size));
gc.reset(new NPUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#endif
} // max_memory_size >= 0
......
......@@ -25,13 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
namespace paddle {
namespace platform {
class DeviceContext;
class Place;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
......
......@@ -153,14 +153,12 @@ void TensorFromArray(const T* src, const size_t& array_size,
auto size = array_size * sizeof(T);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -176,8 +174,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -205,14 +202,12 @@ void TensorFromVector(const std::vector<T>& src,
auto size = src.size() * sizeof(T);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -233,8 +228,7 @@ void TensorFromVector(const std::vector<T>& src,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -252,8 +246,7 @@ void TensorFromVector(const std::vector<T>& src,
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(dst_place)) {
memory::Copy(
BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -280,14 +273,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
auto size = src.size() * sizeof(bool);
if (platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
src_ptr, size,
dst_place, dst_ptr, src_place, src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
......@@ -303,8 +294,7 @@ inline void TensorFromVector(const std::vector<bool>& src,
// 2. async copy npu pinned tensor -> npu tensor
memory::Copy(
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
npu_pinned_place, npu_pinned_ptr, size,
dst_place, dst_ptr, npu_pinned_place, npu_pinned_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
// 3. record event
......@@ -362,37 +352,29 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
auto dst_ptr = static_cast<void*>(dst->data());
if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -412,37 +394,29 @@ inline void TensorToVector(const Tensor& src,
auto dst_ptr = static_cast<void*>(array);
if (platform::is_cpu_place(src.place())) {
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
}
#endif
#if defined(PADDLE_WITH_XPU)
else if (platform::is_xpu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src.place()), src_ptr,
size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
size, nullptr);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()),
src_ptr, size,
dst_place, dst_ptr, src.place(), src_ptr, size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
......@@ -467,8 +441,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
template <>
......@@ -488,8 +461,7 @@ inline void TensorToVector(const Tensor& src, std::vector<bool>* dst) {
"The input tensor should be CPU device, but actually it is in %s.",
src.place()));
memory::Copy(dst_place, dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size);
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
for (unsigned int i = 0; i < src.numel(); i++) {
(*dst)[i] = static_cast<bool>(array[i]);
......
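Throughout tensor_util.h the memory::Copy calls now hand over the generic src/dst places, so a single call shape serves the CPU, GPU, XPU, NPU and MLU branches. The sketch below mimics that unified signature with a host-only mock Copy; the real paddle::memory::Copy also has stream-taking overloads for asynchronous device transfers.

```cpp
#include <cstring>
#include <iostream>
#include <vector>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
};

// Mock of the unified copy entry point: one signature taking two generic places.
void Copy(const Place& dst_place, void* dst, const Place& src_place,
          const void* src, std::size_t n) {
  if (dst_place.type == AllocationType::CPU &&
      src_place.type == AllocationType::CPU) {
    std::memcpy(dst, src, n);  // host path of this sketch
  }
  // A real implementation would branch to cudaMemcpyAsync or the XPU/NPU/MLU
  // equivalents for the device-side place types.
}

int main() {
  std::vector<float> src{1.f, 2.f, 3.f}, dst(3);
  Place cpu{AllocationType::CPU};
  // Call sites no longer spell out BOOST_GET_CONST(platform::CPUPlace, ...).
  Copy(cpu, dst.data(), cpu, src.data(), src.size() * sizeof(float));
  std::cout << dst[0] << " " << dst[1] << " " << dst[2] << "\n";  // 1 2 3
}
```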
......@@ -86,7 +86,7 @@ void BKCLParallelContext::Init() {
}
BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
int xpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
......@@ -111,7 +111,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) {
}
BcastBKCLId(bkcl_ids, 0);
int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
int xpu_id = place_.device;
VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
<< " ring id: " << ring_id;
......
......@@ -78,7 +78,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
TensorAddFunctor(int64_t numel, const T* x, T* y)
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) {
void operator()(const platform::CPUPlace& place) const {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
......@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#ifdef PADDLE_WITH_XPU
void operator()(const platform::XPUPlace& place) {
void operator()(const platform::XPUPlace& place) const {
using XPUType = typename XPUTypeTrait<T>::Type;
platform::XPUDeviceContext* ctx = dynamic_cast<platform::XPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
......@@ -100,7 +100,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
r, XPUAPIErrorMsg[r]));
}
#else
void operator()(const platform::XPUPlace& place) {
void operator()(const platform::XPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -109,7 +109,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) {
void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
......@@ -117,7 +117,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_);
}
#else
void operator()(const platform::CUDAPlace& place) {
void operator()(const platform::CUDAPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -126,7 +126,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_MLU
void operator()(const platform::MLUPlace& place) {
void operator()(const platform::MLUPlace& place) const {
// TODO(fwg): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......@@ -134,7 +134,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place));
}
#else
void operator()(const platform::MLUPlace& place) {
void operator()(const platform::MLUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -143,7 +143,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
#endif
#ifdef PADDLE_WITH_ASCEND_CL
void operator()(const platform::NPUPlace& place) {
void operator()(const platform::NPUPlace& place) const {
// TODO(zhiqiu): SUPPORT it
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......@@ -151,7 +151,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
place));
}
#else
void operator()(const platform::NPUPlace& place) {
void operator()(const platform::NPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -159,21 +159,21 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
void operator()(const platform::NPUPinnedPlace& place) {
void operator()(const platform::NPUPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
// there is NO blas in CUDAPinnedPlace
void operator()(const platform::CUDAPinnedPlace& place) {
void operator()(const platform::CUDAPinnedPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
// there is NO support in IPUPlace
void operator()(const platform::IPUPlace& place) {
void operator()(const platform::IPUPlace& place) const {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
......@@ -183,7 +183,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
private:
int64_t numel_;
const T* x_;
T* y_;
mutable T* y_;
};
#ifdef PADDLE_WITH_XPU
......@@ -248,7 +248,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
TensorAddFunctor<cpp_type> func( \
numel, src_tensor.data<cpp_type>(), \
dst_tensor->mutable_data<cpp_type>(place)); \
boost::apply_visitor(func, place); \
platform::VisitPlace(place, func); \
return; \
}
......
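TensorAddFunctor's operator() overloads become const (with y_ made mutable) because dispatch moves from boost::apply_visitor over the old variant place to platform::VisitPlace over the unified place. The sketch below reproduces that shape with a hand-rolled VisitPlace stand-in; it is not the real helper, only the same const-visitor pattern.

```cpp
#include <iostream>

enum class AllocationType { CPU, GPU };

struct CPUPlace {};
struct CUDAPlace { int device = 0; };

struct Place {
  AllocationType type;
  int device = 0;
};

// Hand-rolled stand-in for platform::VisitPlace: it maps the unified place
// back to a typed place and invokes the matching const overload.
template <typename Visitor>
void VisitPlace(const Place& place, const Visitor& visitor) {
  switch (place.type) {
    case AllocationType::CPU:
      visitor(CPUPlace{});
      break;
    case AllocationType::GPU:
      visitor(CUDAPlace{place.device});
      break;
  }
}

// Same shape as TensorAddFunctor after the diff: const call operators,
// with mutable members for any state the functor must still modify.
struct TensorAddSketch {
  void operator()(const CPUPlace&) const {
    std::cout << "would run the CPU blas AXPY path\n";
  }
  void operator()(const CUDAPlace& p) const {
    std::cout << "would run the CUDA blas AXPY path on device " << p.device << "\n";
  }
};

int main() {
  TensorAddSketch func;
  VisitPlace(Place{AllocationType::GPU, 0}, func);  // was boost::apply_visitor(func, place)
}
```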
......@@ -86,7 +86,7 @@ void HCCLParallelContext::Init() {
}
BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device;
int npu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
......@@ -96,10 +96,10 @@ void HCCLParallelContext::Init() {
&hccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, npu_id,
ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
compute_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
}
}
......@@ -117,7 +117,7 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
}
BcastHCCLId(hccl_ids, 0, server_fd);
int npu_id = BOOST_GET_CONST(platform::NPUPlace, place_).device;
int npu_id = place_.device;
VLOG(0) << "init hccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " npu id: " << npu_id
<< " ring id: " << ring_id;
......@@ -125,10 +125,10 @@ void HCCLParallelContext::InitWithRingID(int ring_id) {
platform::HCCLCommContext::Instance().CreateHCCLComm(
&hccl_ids[0], strategy_.nranks_, strategy_.local_rank_, npu_id, ring_id);
compute_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
comm_events_.emplace_back(platform::NpuEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::NPUPlace, place_).device));
compute_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::NpuEventResourcePool::Instance().New(place_.device));
}
void HCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
......@@ -193,7 +193,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_var_->MutableVar()->GetMutable<framework::SelectedRows>();
if (grad_t->mutable_value()->IsInitialized()) {
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place());
if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif
grad_t->mutable_rows()->clear();
grad_t->mutable_value()->clear();
......@@ -211,7 +211,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
grad_t->clear();
}
#ifdef PADDLE_WITH_MKLDNN
if (FLAGS_use_mkldnn) ClearMKLDNNCache(grad_t->place());
if (FLAGS_use_mkldnn) platform::ClearMKLDNNCache(grad_t->place());
#endif
}
}
......
......@@ -77,7 +77,7 @@ void NCCLParallelContext::Init() {
}
BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
int gpu_id = place_.device;
for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
......@@ -88,10 +88,9 @@ void NCCLParallelContext::Init() {
ring_id);
compute_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
platform::CudaEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
}
}
......@@ -111,7 +110,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
}
BcastNCCLId(nccl_ids, 0, server_fd);
int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
int gpu_id = place_.device;
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id;
......@@ -119,10 +118,10 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
BOOST_GET_CONST(platform::CUDAPlace, place_).device));
compute_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
comm_events_.emplace_back(
platform::CudaEventResourcePool::Instance().New(place_.device));
}
void NCCLParallelContext::AllReduceByStream(const framework::Variable &src,
......
......@@ -194,7 +194,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (is_xpu_place(expected_kernel_key.place_) &&
if (paddle::platform::is_xpu_place(expected_kernel_key.place_) &&
(kernel_iter == kernels.end() ||
!paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
paddle::platform::is_in_xpu_black_list(op.Type()))) {
......@@ -207,7 +207,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif
#ifdef PADDLE_WITH_ASCEND_CL
if (kernel_iter == kernels.end() &&
is_npu_place(expected_kernel_key.place_)) {
paddle::platform::is_npu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing NPU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......@@ -217,7 +217,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
is_mlu_place(expected_kernel_key.place_)) {
paddle::platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
......
......@@ -835,7 +835,7 @@ void Reducer::MarkGroupReady(size_t group_index) {
// thrown in comm_pool_.
auto next_group = next_group_;
comm_pool_->enqueue([this, run_order, next_group, &group] {
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
auto dev_id = place_.device;
platform::SetXPUDeviceId(dev_id);
FusedAllReduceSchedule(run_order, group, next_group);
{
......
......@@ -87,8 +87,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
std::unique_ptr<framework::GarbageCollector> gc;
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), 0));
gc.reset(new framework::DefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
......@@ -98,8 +97,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gc.reset(new framework::CUDAPinnedGarbageCollector(
BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0));
gc.reset(new framework::CUDAPinnedGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
......@@ -110,8 +108,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
gc.reset(new framework::XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place), 0));
gc.reset(new framework::XPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -119,14 +116,12 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new framework::CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place), 0));
gc.reset(new framework::CPUGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else if (platform::is_npu_place(place)) {
#if defined(PADDLE_WITH_ASCEND_CL)
// TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
gc.reset(new framework::NPUUnsafeFastGarbageCollector(
BOOST_GET_CONST(platform::NPUPlace, place), 0));
gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -135,8 +130,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::MLUPlace, place), 0));
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -197,31 +191,28 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
try {
if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device);
platform::SetDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
platform::SetXPUDeviceId(
BOOST_GET_CONST(platform::XPUPlace, place).device);
platform::SetXPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
} else if (platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
platform::SetNPUDeviceId(
BOOST_GET_CONST(platform::NPUPlace, place).device);
platform::SetNPUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(
BOOST_GET_CONST(platform::MLUPlace, place).device);
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
......
......@@ -127,7 +127,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place);
auto dst_gpu_place = place;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length(),
dev_ctx->stream());
......@@ -137,7 +137,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t,
#endif
} else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place);
auto dst_xpu_place = place;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), pt.data.data(), pt.data.length());
#else
......@@ -954,14 +954,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
......@@ -993,14 +993,14 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
// model.
res->SetPlace(PaddlePlace::kCPU);
} else {
auto xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto xpu_place = place_;
res->SetPlace(PaddlePlace::kXPU, xpu_place.GetDeviceId());
}
} else if (platform::is_npu_place(place_)) {
auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto npu_place = place_;
res->SetPlace(PaddlePlace::kNPU, npu_place.GetDeviceId());
} else {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto gpu_place = place_;
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
......@@ -1050,7 +1050,7 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) {
if (stream != nullptr) {
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
auto gpu_place = place_;
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
dev_ctx->SetThreadLocalStream(stream);
......@@ -1065,7 +1065,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
......
......@@ -243,7 +243,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, place_);
auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
......@@ -253,7 +253,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#endif
} else if (platform::is_xpu_place(place_)) {
#ifdef PADDLE_WITH_XPU
auto dst_xpu_place = BOOST_GET_CONST(platform::XPUPlace, place_);
auto dst_xpu_place = place_;
memory::Copy(dst_xpu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length());
......@@ -267,7 +267,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
platform::DeviceContextPool::Instance();
auto *dev_ctx =
static_cast<const platform::NPUDeviceContext *>(pool.Get(place_));
auto dst_npu_place = BOOST_GET_CONST(platform::NPUPlace, place_);
auto dst_npu_place = place_;
memory::Copy(dst_npu_place, static_cast<void *>(input_ptr),
platform::CPUPlace(), inputs[i].data.data(),
inputs[i].data.length(), dev_ctx->stream());
......
......@@ -253,7 +253,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, t_place);
auto gpu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
pool.Get(gpu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
......@@ -280,7 +280,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#endif
} else if (place_ == PlaceType::kXPU) {
#ifdef PADDLE_WITH_XPU
auto xpu_place = BOOST_GET_CONST(paddle::platform::XPUPlace, t_place);
auto xpu_place = t_place;
paddle::memory::Copy(paddle::platform::CPUPlace(),
static_cast<void *>(data), xpu_place, t_data,
ele_num * sizeof(T));
......@@ -293,7 +293,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb,
#ifdef PADDLE_WITH_ASCEND_CL
paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance();
auto npu_place = BOOST_GET_CONST(paddle::platform::NPUPlace, t_place);
auto npu_place = t_place;
auto *dev_ctx = static_cast<const paddle::platform::NPUDeviceContext *>(
pool.Get(npu_place));
paddle::memory::Copy(paddle::platform::CPUPlace(),
......
......@@ -134,7 +134,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
"Lite::MemoryCopy CPU->GPU is not yet implemented."));
} else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place);
auto gpu_place = src_place;
memory::Copy(
gpu_place, dst_data, gpu_place, src_data, size,
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
......
......@@ -813,8 +813,7 @@ const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -838,8 +837,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -859,8 +857,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
}
#endif
platform::CUDAPlace cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Release(cuda_place, m_->GetDefaultStream(cuda_place));
}
#endif
......@@ -935,7 +932,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
}
#endif
platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place);
platform::CUDAPlace p(place.GetDeviceId());
if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
->Allocate(size);
......
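Inside AllocatorFacade the typed place is no longer extracted with BOOST_GET_CONST but rebuilt as platform::CUDAPlace(place.GetDeviceId()), since the stream-keyed allocator lookups still want a concrete CUDA place. A small sketch of that conversion, using mock types rather than the real allocator code:

```cpp
#include <iostream>
#include <map>

enum class AllocationType { CPU, GPU };

// Mock of the unified place: keeps only the backend tag and the device id.
struct Place {
  AllocationType type;
  int device = 0;
  int GetDeviceId() const { return device; }
};

// Mock typed place, still used where containers are keyed by CUDAPlace.
struct CUDAPlace {
  explicit CUDAPlace(int id) : device(id) {}
  int device;
  bool operator<(const CUDAPlace& other) const { return device < other.device; }
};

int main() {
  std::map<CUDAPlace, const char*> allocators;
  allocators.emplace(CUDAPlace(0), "default-stream allocator for gpu:0");

  Place place{AllocationType::GPU, 0};
  // Mirrors the diff: rebuild the typed place from the generic one instead
  // of extracting it with BOOST_GET_CONST.
  CUDAPlace cuda_place(place.GetDeviceId());
  std::cout << allocators.at(cuda_place) << "\n";
}
```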
......@@ -19,12 +19,7 @@
#include <map>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace platform {
class Place;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/platform//place.h"
namespace paddle {
namespace memory {
......
......@@ -34,7 +34,7 @@ namespace allocation {
bool CUDAAllocator::IsAllocThreadSafe() const { return true; }
void CUDAAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedGpuFree(allocation->ptr(), allocation->size(),
......
......@@ -144,8 +144,8 @@ class CUDADeviceContextAllocatorPool {
}
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
auto iter = allocators_.find(
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()));
auto iter =
allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE(
iter, allocators_.end(),
platform::errors::NotFound("No allocator found for CUDAPlace."));
......
......@@ -103,7 +103,7 @@ bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; }
void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"GPU memory is freed in incorrect device. This may be a bug"));
......
......@@ -26,6 +26,7 @@
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/common/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -791,7 +792,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
namespace allocation {
pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
......@@ -799,16 +800,16 @@ pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
}
void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) {
boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place());
paddle::platform::VisitPlace(
allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
return boost::apply_visitor(legacy::ReleaseVisitor(), place);
return paddle::platform::VisitPlace(place, legacy::ReleaseVisitor());
}
} // namespace allocation
......
......@@ -24,7 +24,7 @@ namespace allocation {
bool NPUAllocator::IsAllocThreadSafe() const { return true; }
void NPUAllocator::FreeImpl(pten::Allocation* allocation) {
PADDLE_ENFORCE_EQ(
BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_,
allocation->place(), place_,
platform::errors::PermissionDenied(
"NPU memory is freed in incorrect device. This may be a bug"));
platform::RecordedNPUFree(allocation->ptr(), allocation->size(),
......
......@@ -164,8 +164,7 @@ void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) {
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
std::vector<StreamSafeCUDAAllocator*>& allocators =
allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place];
uint64_t released_size = 0;
for (StreamSafeCUDAAllocator* allocator : allocators) {
released_size += allocator->ProcessUnfreedAllocationsWithRelease();
......@@ -192,7 +191,7 @@ uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
return underlying_allocator_->Release(place_);
}
std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
StreamSafeCUDAAllocator::allocator_map_;
SpinLock StreamSafeCUDAAllocator::allocator_map_lock_;
......
......@@ -65,7 +65,7 @@ class StreamSafeCUDAAllocator : public Allocator {
void ProcessUnfreedAllocations();
uint64_t ProcessUnfreedAllocationsWithRelease();
static std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator *>>
static std::map<platform::Place, std::vector<StreamSafeCUDAAllocator *>>
allocator_map_;
static SpinLock allocator_map_lock_;
......
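StreamSafeCUDAAllocator's static allocator_map_ switches its key type from platform::CUDAPlace to platform::Place, which requires the unified place to be usable as a std::map key. The sketch below assumes an ordering by (type, device id); the real comparison operator may be defined differently.

```cpp
#include <iostream>
#include <map>
#include <tuple>
#include <vector>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
  // Assumed ordering so the unified place can key a std::map.
  bool operator<(const Place& other) const {
    return std::tie(type, device) < std::tie(other.type, other.device);
  }
};

struct AllocatorStub { const char* name; };

int main() {
  // Mirrors: std::map<platform::Place, std::vector<StreamSafeCUDAAllocator*>>
  std::map<Place, std::vector<AllocatorStub>> allocator_map;
  allocator_map[Place{AllocationType::GPU, 0}].push_back({"stream-safe gpu:0"});
  allocator_map[Place{AllocationType::GPU, 1}].push_back({"stream-safe gpu:1"});

  for (const auto& kv : allocator_map) {
    std::cout << "gpu:" << kv.first.device << " -> " << kv.second.size()
              << " allocator(s)\n";
  }
}
```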
......@@ -23,8 +23,7 @@ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
if (platform::is_gpu_place(place_)) {
buddy_allocator_.reset(new memory::detail::BuddyAllocator(
std::unique_ptr<memory::detail::SystemAllocator>(
new memory::detail::GPUAllocator(
BOOST_GET_CONST(platform::CUDAPlace, place_).device)),
new memory::detail::GPUAllocator(place_.device)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
} else {
PADDLE_THROW(platform::errors::Unavailable(
......
......@@ -16,12 +16,6 @@
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
namespace paddle {
namespace platform {
struct CUDAPlace;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
using framework::Tensor;
......
......@@ -25,8 +25,7 @@ struct GetTensorValue<platform::CUDADeviceContext, T> {
const framework::Tensor& tensor) const {
const T* data = tensor.data<T>();
T value;
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
const auto gpu_place = dev_ctx.GetPlace();
memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T),
dev_ctx.stream());
return value;
......
......@@ -117,9 +117,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel();
}
int64_t total_num = h_starts[xs_size];
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
// copy each tensor's data address to device
auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*));
......@@ -134,8 +133,8 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
h_xs[i] = xs[i]->data<T>();
h_outs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs,
cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_xs, cpu_place, h_xs,
2 * xs_size * sizeof(T*), dev_ctx.stream());
// Launch Kernel
int threads_per_block = std::min(static_cast<int64_t>(1024), total_num);
......
......@@ -41,8 +41,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
MPDType cpu_scale_data;
if (platform::is_xpu_place(scale->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
BOOST_GET_CONST(platform::XPUPlace, scale->place()),
static_cast<const void*>(scale_data), sizeof(MPDType));
scale->place(), static_cast<const void*>(scale_data),
sizeof(MPDType));
} else {
cpu_scale_data = (*scale_data);
......@@ -87,8 +87,7 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
dev_ctx.Wait();
}
memory::Copy(platform::CPUPlace(), &cpu_found_inf_data,
BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, sizeof(bool));
dev_ctx.GetPlace(), found_inf_data, sizeof(bool));
}
if (cpu_found_inf_data) {
......@@ -142,9 +141,8 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
found_inf_data, platform::CPUPlace(), &cpu_found_inf_data,
sizeof(bool));
memory::Copy(dev_ctx.GetPlace(), found_inf_data, platform::CPUPlace(),
&cpu_found_inf_data, sizeof(bool));
}
};
......
......@@ -114,9 +114,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (int i = 0; i < xs_size; i++) {
h_starts[i + 1] = h_starts[i] + outs[i]->numel();
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_starts, cpu_place, h_starts,
(xs_size + 1) * sizeof(int64_t), dev_ctx.stream());
// copy each tensor of "outs" data address array to device
auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*));
......@@ -128,9 +127,8 @@ class LazyZeros<platform::CUDADeviceContext, T> {
for (size_t i = 0; i < xs_size; ++i) {
h_out_addrs[i] = outs[i]->mutable_data<T>(dev_ctx.GetPlace());
}
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*),
dev_ctx.stream());
memory::Copy(dev_ctx.GetPlace(), d_out_addrs, cpu_place, h_out_addrs,
xs_size * sizeof(T*), dev_ctx.stream());
// launch cuda kernel
int64_t total_num = h_starts[xs_size];
......
......@@ -187,9 +187,7 @@ class LazyZerosNPU {
framework::TensorCopy(*x, place, dev_ctx, out);
} else if (zero_ptr != dst_ptr) {
auto size = out->numel() * framework::SizeOfType(out->type());
memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size,
stream);
memory::Copy(place, dst_ptr, place, zero_ptr, size, stream);
}
}
}
......
......@@ -43,8 +43,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) {
memory::Copy(platform::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
BOOST_GET_CONST(platform::XPUPlace, found_inf->place()),
static_cast<void*>(&cpu_found_inf_data), found_inf->place(),
static_cast<const void*>(found_inf_data), sizeof(bool));
} else {
cpu_found_inf_data = (*found_inf_data);
......@@ -97,16 +96,16 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data),
BOOST_GET_CONST(platform::XPUPlace, bad_in->place()),
static_cast<const void*>(bad_in_data), sizeof(int));
bad_in->place(), static_cast<const void*>(bad_in_data),
sizeof(int));
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (platform::is_xpu_place(good_in->place())) {
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data),
BOOST_GET_CONST(platform::XPUPlace, good_in->place()),
static_cast<const void*>(good_in_data), sizeof(int));
good_in->place(), static_cast<const void*>(good_in_data),
sizeof(int));
} else {
cpu_good_in_data = (*good_in_data);
}
......@@ -114,7 +113,7 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
if (platform::is_xpu_place(pre_loss_scaling->place())) {
memory::Copy(
platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data),
BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()),
pre_loss_scaling->place(),
static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType));
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
......@@ -146,15 +145,13 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
}
}
// copy to device
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
bad_out_data, platform::CPUPlace(), &cpu_bad_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
good_out_data, platform::CPUPlace(), &cpu_good_out_data,
sizeof(int));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()),
updated_loss_scaling_data, platform::CPUPlace(),
&cpu_updated_loss_scaling_data, sizeof(MPDType));
memory::Copy(dev_ctx.GetPlace(), bad_out_data, platform::CPUPlace(),
&cpu_bad_out_data, sizeof(int));
memory::Copy(dev_ctx.GetPlace(), good_out_data, platform::CPUPlace(),
&cpu_good_out_data, sizeof(int));
memory::Copy(dev_ctx.GetPlace(), updated_loss_scaling_data,
platform::CPUPlace(), &cpu_updated_loss_scaling_data,
sizeof(MPDType));
}
};
......
......@@ -25,8 +25,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -27,8 +27,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -26,8 +26,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,8 +25,7 @@ void GetAccumulators<paddle::platform::CUDADeviceContext>(
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
auto stream = ctx.cuda_device_context().stream();
auto cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, in_old_num_accumulates->place());
auto cuda_place = in_old_num_accumulates->place();
memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
stream);
......@@ -44,8 +43,7 @@ void SetAccumulators<paddle::platform::CUDADeviceContext>(
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
auto cuda_place =
BOOST_GET_CONST(platform::CUDAPlace, out_old_num_accumulates->place());
auto cuda_place = out_old_num_accumulates->place();
memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
......
......@@ -57,8 +57,7 @@ class BernoulliOpKernel<platform::CUDADeviceContext, T>
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
int device_id =
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
int device_id = ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
auto seed_offset = gen_cuda->IncrementOffset(1);
int64_t gen_offset = size * seed_offset.second;
......
......@@ -102,8 +102,7 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
std::vector<int> error_info; // only for checking positive matrix
error_info.resize(batch_count);
memory::Copy(platform::CPUPlace(), error_info.data(),
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(),
info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
......
......@@ -306,7 +306,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
num_classes, num_samples));
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
auto place = dev_ctx.GetPlace();
int batch_size = label->numel();
// Algorithm:
......@@ -397,8 +397,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
(NumBlocks(num_classes) * kNumCUDAThreads * vec_size) +
1) *
vec_size;
int device_id =
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
int device_id = ctx.GetPlace().GetDeviceId();
auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
if (gen_cuda->GetIsInitPy() && (!fix_seed)) {
auto seed_offset = gen_cuda->IncrementOffset(offset);
......
......@@ -33,7 +33,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true,
platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
......
......@@ -34,7 +34,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
"The place of ExecutionContext should be CUDAPlace."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).device;
int dev_id = ctx.GetPlace().device;
int root_dev_id = ctx.Attr<int>("root");
auto in = ctx.Input<framework::Tensor>("X");
......
......@@ -40,7 +40,7 @@ class BKCLBroadcastOpKernel : public framework::OpKernel<T> {
"The place of ExecutionContext should be XPUPlace."));
#if defined(PADDLE_WITH_XPU_BKCL)
int dev_id = BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()).device;
int dev_id = ctx.GetPlace().device;
int root_dev_id = ctx.Attr<int>("root");
auto in = ctx.Input<framework::Tensor>("X");
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -24,7 +24,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct XPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -22,7 +22,6 @@ namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
struct CPUPlace;
struct float16;
} // namespace platform
} // namespace paddle
......
......@@ -16,7 +16,7 @@ limitations under the License. */
namespace paddle {
namespace platform {
struct CUDAPlace;
struct float16;
} // namespace platform
} // namespace paddle
......