Unverified commit 755438a7, authored by Leo Chen, committed by GitHub

unify cpu context, part2 (#44012)

* fix init()

* delete test_device_context

* replace CPUDeviceContext with CPUContext

* fix test_scalar

* remove dot_op.cc

* fix compile
Parent 09096aeb
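
For reference, below is a minimal C++ sketch of the recurring replacement pattern in this diff: device contexts that were previously typed as paddle::platform::CPUDeviceContext (whether fetched from the DeviceContextPool or constructed locally) are now typed as phi::CPUContext, and templated helpers such as phi::funcs::GetBlas are instantiated on phi::CPUContext. The include paths and the example::AxpyOnCpu wrapper are illustrative assumptions for this sketch, not part of the commit.

    // Sketch only; assumes a Paddle build where these headers are available.
    #include "paddle/fluid/platform/device_context.h"   // DeviceContextPool, CPUPlace
    #include "paddle/phi/kernels/funcs/blas/blas.h"      // phi::funcs::GetBlas

    namespace example {  // hypothetical namespace, for illustration only

    template <typename T>
    void AxpyOnCpu(int64_t n, const T* x, T* y) {
      // Before this commit the cast target here was platform::CPUDeviceContext;
      // after the unification it is the phi::CPUContext type.
      auto* ctx = static_cast<phi::CPUContext*>(
          paddle::platform::DeviceContextPool::Instance().Get(
              paddle::platform::CPUPlace()));
      // BLAS helpers are likewise instantiated on phi::CPUContext now.
      auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(*ctx);
      blas.AXPY(n, static_cast<T>(1.0), x, y);  // y += x, as in TensorAddFunctor
    }

    }  // namespace example
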
......@@ -251,7 +251,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
ConcatTensorsWithType(
*default_ctx, dense_tensors_, &dense_contents_, dtype_);
......@@ -274,7 +274,7 @@ void EagerGroup::SplitTensors(const platform::Place &place) {
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
auto *default_ctx = static_cast<platform::CPUDeviceContext *>(
auto *default_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(place));
SplitTensorsWithType(
*default_ctx, &dense_contents_, &dense_tensors_, dtype_);
......@@ -891,7 +891,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
"Please recompile or reinstall Paddle with NCCL support."));
#endif
} else if (platform::is_cpu_place(inner_place_)) {
dev_ctx = static_cast<platform::CPUDeviceContext *>(
dev_ctx = static_cast<phi::CPUContext *>(
platform::DeviceContextPool::Instance().Get(inner_place_));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
......
......@@ -31,9 +31,9 @@ namespace paddle {
namespace distributed {
template <typename T>
inline phi::funcs::BlasT<paddle::platform::CPUDeviceContext, T> GetBlas() {
paddle::platform::CPUDeviceContext cpu_ctx;
return phi::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(cpu_ctx);
inline phi::funcs::BlasT<phi::CPUContext, T> GetBlas() {
phi::CPUContext cpu_ctx;
return phi::funcs::GetBlas<phi::CPUContext, T>(cpu_ctx);
}
template <typename T>
......
......@@ -353,11 +353,12 @@ void Communicator::RpcRecvSparse(const std::string &varname,
bool training = true;
auto status = _worker_ptr->PullSparseParam((float **)push_g_vec.data(),
table_id, // NOLINT
sparse_push_keys.data(),
sparse_push_keys.size(),
training);
auto status =
_worker_ptr->PullSparseParam(static_cast<float **>(push_g_vec.data()),
table_id,
sparse_push_keys.data(),
sparse_push_keys.size(),
training);
status.wait();
return;
}
......@@ -1184,12 +1185,12 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
auto &t_latest = var_latest->Get<framework::LoDTensor>();
auto t_timestamp = var_timestamp->GetMutable<framework::LoDTensor>();
paddle::platform::CPUDeviceContext cpu_ctx;
phi::CPUContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
auto blas = phi::funcs::GetBlas<phi::CPUContext, float>(cpu_ctx);
blas.VSUB(t_latest.numel(),
t_latest.data<float>(),
t_timestamp->data<float>(),
......@@ -1218,7 +1219,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
RpcRecvDense(varnames, table_id, pserver_scope_.get());
// 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver
paddle::platform::CPUDeviceContext cpu_ctx;
phi::CPUContext cpu_ctx;
for (auto &varname : varnames) {
auto *var_latest = recv_scope_->FindVar(varname);
auto t_latest = var_latest->GetMutable<framework::LoDTensor>();
......@@ -1233,7 +1234,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
auto *t_delta = var_delta->GetMutable<framework::LoDTensor>();
t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
auto blas = phi::funcs::GetBlas<phi::CPUContext, float>(cpu_ctx);
blas.VSUB(t_latest->numel(),
t_pserver.data<float>(),
t_old->data<float>(),
......@@ -1334,7 +1335,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
auto *t_old = var_old->GetMutable<framework::LoDTensor>();
auto dims1 = t_latest.dims()[1];
paddle::platform::CPUDeviceContext cpu_ctx;
phi::CPUContext cpu_ctx;
auto *var_delta = delta_scope_->Var(varname);
auto *t_delta = var_delta->GetMutable<phi::SelectedRows>();
......@@ -1345,7 +1346,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
t_delta->set_rows(sparse_ids);
t_delta->set_height(t_latest.dims()[0]);
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
auto blas = phi::funcs::GetBlas<phi::CPUContext, float>(cpu_ctx);
float coefficient = 1.0 / static_cast<float>(trainers_);
std::vector<float *> push_g_vec;
......@@ -1419,8 +1420,8 @@ void GeoCommunicator::RecvSparse(const std::string &varname,
std::vector<float> v_delta;
v_delta.resize(numel);
paddle::platform::CPUDeviceContext cpu_ctx;
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
phi::CPUContext cpu_ctx;
auto blas = phi::funcs::GetBlas<phi::CPUContext, float>(cpu_ctx);
for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
......
......@@ -185,9 +185,8 @@ inline void MergeVars(const std::string &var_name,
}
// set output tensor to 0.
paddle::platform::CPUDeviceContext cpu_ctx;
phi::funcs::SetConstant<paddle::platform::CPUDeviceContext, T>
constant_functor;
phi::CPUContext cpu_ctx;
phi::funcs::SetConstant<phi::CPUContext, T> constant_functor;
constant_functor(cpu_ctx, out_t, static_cast<T>(0));
// sum all vars to out
auto result = EigenVector<T>::Flatten(*out_t);
......@@ -210,16 +209,13 @@ inline void MergeVars(const std::string &var_name,
for (auto &var : vars) {
inputs.push_back(&var->Get<phi::SelectedRows>());
}
paddle::platform::CPUDeviceContext dev_ctx;
phi::CPUContext dev_ctx;
if (merge_add) {
paddle::operators::math::scatter::
MergeAdd<paddle::platform::CPUDeviceContext, T>
merge_add;
paddle::operators::math::scatter::MergeAdd<phi::CPUContext, T> merge_add;
merge_add(dev_ctx, inputs, out_slr);
} else {
paddle::operators::math::scatter::
MergeAverage<paddle::platform::CPUDeviceContext, T>
merge_average;
paddle::operators::math::scatter::MergeAverage<phi::CPUContext, T>
merge_average;
merge_average(dev_ctx, inputs, out_slr);
}
......
......@@ -48,8 +48,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
#endif
return;
}
paddle::framework::details::tensor_check<
paddle::platform::CPUDeviceContext>(
paddle::framework::details::tensor_check<phi::CPUContext>(
api_name, tensor_name, *dense_tensor, place);
}
}
......
......@@ -90,9 +90,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
test_op,
paddle::framework::TestOpWithKernel,
paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(test_op,
paddle::framework::TestKernel<phi::CPUContext, float>);
REGISTER_OP_CUDA_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
......
......@@ -44,8 +44,8 @@ void CastDataLayout::apply() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
phi::funcs::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
phi::funcs::Transpose<phi::CPUContext, T, 4> trans4;
auto* context = static_cast<const phi::CPUContext*>(ctx_);
trans4(*context, in_, out_, axis_);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
......
......@@ -94,8 +94,8 @@ struct CastDataType {
auto* out_begin = out_->mutable_data<OutType>(in_.place());
if (platform::is_cpu_place(in_.place())) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
platform::Transform<phi::CPUContext> trans;
auto* context = static_cast<const phi::CPUContext*>(ctx_);
trans(*context,
in_begin,
in_end,
......
......@@ -117,7 +117,7 @@ struct TestBroadcastOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
place_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
ctxs_.emplace_back(new phi::CPUContext(p));
}
#if defined(PADDLE_WITH_XPU_BKCL)
bkcl_ctxs_.reset(nullptr);
......
......@@ -69,7 +69,7 @@ struct TestGatherOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
ctxs_.emplace_back(new phi::CPUContext(p));
}
}
}
......
......@@ -316,7 +316,7 @@ template <>
template <>
template <typename T>
void TensorCheckerVisitor<platform::CPUDeviceContext>::apply(
void TensorCheckerVisitor<phi::CPUContext>::apply(
typename std::enable_if<
std::is_floating_point<T>::value ||
std::is_same<T, ::paddle::platform::complex<float>>::value ||
......@@ -329,11 +329,11 @@ void TensorCheckerVisitor<platform::CPUDeviceContext>::apply(
}
template <>
void tensor_check<platform::CPUDeviceContext>(const std::string& op_type,
const std::string& var_name,
const framework::Tensor& tensor,
const platform::Place& place) {
TensorCheckerVisitor<platform::CPUDeviceContext> vistor(
void tensor_check<phi::CPUContext>(const std::string& op_type,
const std::string& var_name,
const framework::Tensor& tensor,
const platform::Place& place) {
TensorCheckerVisitor<phi::CPUContext> vistor(
op_type, var_name, tensor, place);
VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
}
......@@ -439,7 +439,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
#endif
return;
}
tensor_check<platform::CPUDeviceContext>(op_type, var_name, *tensor, place);
tensor_check<phi::CPUContext>(op_type, var_name, *tensor, place);
}
void CheckVarHasNanOrInf(const std::string& op_type,
......
......@@ -81,7 +81,7 @@ struct TestReduceOpHandle {
for (int i = 0; i < count; ++i) {
auto p = p::CPUPlace();
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
ctxs_.emplace_back(new phi::CPUContext(p));
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_ctxs_.reset(nullptr);
......
......@@ -144,7 +144,7 @@ LoDTensor LodExpand(const LoDTensor& source,
auto slice = tensor.Slice(elem, elem + 1);
TensorCopy(source.Slice(ins, ins + 1),
platform::CPUPlace(),
platform::CPUDeviceContext(),
phi::CPUContext(),
&slice);
}
}
......
......@@ -232,9 +232,8 @@ class OpKernelTest : public paddle::framework::OpKernel<T> {
REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestMaker);
REGISTER_OP_CPU_KERNEL(
op_with_kernel,
paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(op_with_kernel,
paddle::framework::OpKernelTest<phi::CPUContext, float>);
REGISTER_OP_CUDA_KERNEL(
op_with_kernel,
......@@ -264,10 +263,9 @@ TEST(OperatorRegistrar, CUDA) {
}
static int op_test_value = 0;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::DeviceContext;
using phi::CPUContext;
namespace paddle {
namespace framework {
......@@ -295,8 +293,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
};
template <typename T>
class OpMultiKernelTest<CPUDeviceContext, T>
: public paddle::framework::OpKernel<T> {
class OpMultiKernelTest<CPUContext, T> : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
++op_test_value;
......@@ -319,7 +316,7 @@ class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
};
template <typename T>
class OpMultiKernelTest2<CPUDeviceContext, T>
class OpMultiKernelTest2<CPUContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
......@@ -342,16 +339,14 @@ class OpMultiKernelTest2<CUDADeviceContext, T>
REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
paddle::framework::OpWithMultiKernelTest,
paddle::framework::OpKernelTestMaker);
REGISTER_OP_KERNEL(
op_with_multi_kernel,
CPU,
paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel,
MKLDNN,
paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(op_with_multi_kernel,
CPU,
paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest<CPUContext, float>);
REGISTER_OP_KERNEL(op_with_multi_kernel,
MKLDNN,
paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest2<CPUContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel,
CUDA,
......
......@@ -420,16 +420,13 @@ REGISTER_OP_WITHOUT_GRADIENT(
REGISTER_OP_CPU_KERNEL(
indicate_lod_tensor_data_type_test,
paddle::framework::EmptyTestKernel<paddle::platform::CPUDeviceContext,
int>);
paddle::framework::EmptyTestKernel<phi::CPUContext, int>);
REGISTER_OP_CPU_KERNEL(
indicate_selected_rows_data_type_test,
paddle::framework::EmptyTestKernel<paddle::platform::CPUDeviceContext,
int>);
paddle::framework::EmptyTestKernel<phi::CPUContext, int>);
REGISTER_OP_CPU_KERNEL(
indicate_other_data_type_test,
paddle::framework::EmptyTestKernel<paddle::platform::CPUDeviceContext,
int>);
paddle::framework::EmptyTestKernel<phi::CPUContext, int>);
TEST(IndicateVarDataTypeTest, lodtensor) {
paddle::framework::InitDevices();
......@@ -599,16 +596,14 @@ REGISTER_OP_WITHOUT_GRADIENT(get_lod_level_test,
paddle::framework::GetSetLoDLevelTestMaker);
REGISTER_OP_CPU_KERNEL(
get_lod_level_test,
paddle::framework::EmptyTestKernel<paddle::platform::CPUDeviceContext,
float>);
paddle::framework::EmptyTestKernel<phi::CPUContext, float>);
REGISTER_OP_WITHOUT_GRADIENT(set_lod_level_test,
paddle::framework::SetLoDLevelTest,
paddle::framework::GetSetLoDLevelTestMaker);
REGISTER_OP_CPU_KERNEL(
set_lod_level_test,
paddle::framework::EmptyTestKernel<paddle::platform::CPUDeviceContext,
float>);
paddle::framework::EmptyTestKernel<phi::CPUContext, float>);
void SetGetLoDLevelTestMain(std::string op_type) {
paddle::framework::InitDevices({});
......
......@@ -66,7 +66,7 @@ struct ConvertToPhiContext {
};
template <>
struct ConvertToPhiContext<platform::CPUDeviceContext> {
struct ConvertToPhiContext<phi::CPUContext> {
using TYPE = phi::CPUContext;
};
......
......@@ -53,7 +53,7 @@ TEST_F(SelectedRowsTester, complete_dims) {
TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
phi::SelectedRows dst_tensor;
platform::CPUDeviceContext cpu_ctx(place_);
phi::CPUContext cpu_ctx(place_);
std::ostringstream oss;
SerializeToStream(oss, *selected_rows_, cpu_ctx);
......
......@@ -1253,7 +1253,7 @@ void TensorFromStream(std::istream& is,
is.seekg(seekg, is.cur);
void* buf;
platform::CPUDeviceContext ctx;
phi::CPUContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......@@ -1336,7 +1336,7 @@ void TensorFromStream(std::istream& is,
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(phi::make_ddim(dims));
void* buf;
platform::CPUDeviceContext ctx;
phi::CPUContext ctx;
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace()) ||
......
......@@ -24,7 +24,7 @@ namespace framework {
TEST(TensorCopy, Tensor) {
Tensor src_tensor;
Tensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
phi::CPUContext cpu_ctx((platform::CPUPlace()));
int* src_ptr = src_tensor.mutable_data<int>(phi::make_ddim({3, 3}),
platform::CPUPlace());
......@@ -164,7 +164,7 @@ TEST(TensorFromVector, Tensor) {
// Copy to CPU Tensor
cpu_tensor.Resize(phi::make_ddim({3, 3}));
auto cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
phi::CPUContext cpu_ctx(*cpu_place);
paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
// Copy to GPUTensor
......@@ -255,20 +255,23 @@ TEST(TensorToVector, Tensor) {
#endif
}
TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src;
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 3 * 3; ++i) {
src_ptr[i] = static_cast<bool>(i % 2);
}
TEST(TensorToVector, Tensor_bool) {
{
paddle::framework::Tensor src;
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
for (int i = 0; i < 3 * 3; ++i) {
src_ptr[i] = static_cast<bool>(i % 2);
}
paddle::platform::CPUPlace place;
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst);
paddle::platform::CPUPlace place;
std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]);
for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]);
}
}
} // namespace framework
#ifdef PADDLE_WITH_CUDA
{
std::vector<bool> src_vec = {
......@@ -325,7 +328,7 @@ for (int i = 0; i < 3 * 3; ++i) {
}
}
#endif
} // namespace paddle
}
TEST(TensorFromDLPack, Tensor) {
{
......@@ -334,7 +337,7 @@ TEST(TensorFromDLPack, Tensor) {
cpu_tensor.Resize(phi::make_ddim({3, 3}));
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext cpu_ctx(cpu_place);
phi::CPUContext cpu_ctx(cpu_place);
paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
paddle::framework::DLPackTensor dlpack_tensor(cpu_tensor, 1);
......@@ -360,7 +363,7 @@ TEST(TensorFromDLPack, Tensor) {
// Copy to CPU Tensor
cpu_tensor.Resize(phi::make_ddim({3, 3}));
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext cpu_ctx(cpu_place);
phi::CPUContext cpu_ctx(cpu_place);
paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
// Copy to GPUTensor
......@@ -502,7 +505,7 @@ TEST(Tensor, FromAndToStream) {
{
framework::Tensor dst_tensor;
auto place = new platform::CPUPlace();
platform::CPUDeviceContext cpu_ctx(*place);
phi::CPUContext cpu_ctx(*place);
std::ostringstream oss;
TensorToStream(oss, src_tensor, cpu_ctx);
......
......@@ -46,8 +46,8 @@ void GLOOParallelContext::Init() {
int port = std::stoi(addr[1]);
gloo_wrapper->SetHttpStore(host, port, "worker");
gloo_wrapper->Init();
device_ = std::unique_ptr<platform::CPUDeviceContext>(
new platform::CPUDeviceContext(platform::CPUPlace()));
device_ = std::unique_ptr<phi::CPUContext>(
new phi::CPUContext(platform::CPUPlace()));
device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CPUPlace())
.get());
......@@ -200,7 +200,7 @@ void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) {
paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext(
int ring_id) {
// return the CPUDeviceContext
// return the CPUContext
return device_.get();
}
......
......@@ -64,7 +64,7 @@ class GLOOParallelContext : public ParallelContext {
void AllReduce(const phi::SelectedRows& src, phi::SelectedRows* dst);
private:
std::unique_ptr<platform::CPUDeviceContext> device_;
std::unique_ptr<phi::CPUContext> device_;
};
} // namespace imperative
......
......@@ -85,9 +85,9 @@ class TensorAddFunctor : public boost::static_visitor<> {
: numel_(numel), x_(x), y_(y) {}
void operator()(const platform::CPUPlace& place) const {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
phi::CPUContext* ctx = dynamic_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(*ctx);
auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
......@@ -438,7 +438,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
place));
#endif
} else if (platform::is_cpu_place(place)) {
return TensorAddImpl<platform::CPUDeviceContext, platform::float16>(
return TensorAddImpl<phi::CPUContext, platform::float16>(
src_tensor, dst_tensor, place);
}
}
......@@ -455,7 +455,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
place));
#endif
} else if (platform::is_cpu_place(place)) {
return TensorAddImpl<platform::CPUDeviceContext, platform::bfloat16>(
return TensorAddImpl<phi::CPUContext, platform::bfloat16>(
src_tensor, dst_tensor, place);
}
}
......@@ -498,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
......@@ -550,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var,
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double);
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
......@@ -613,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1,
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double);
} else {
#endif
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double);
PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
......
......@@ -53,12 +53,11 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
}
framework::VisitDataTypeForHIP(
dtype_,
DivNRanksForAllReduce<platform::CPUDeviceContext>(
tensor, nranks, context));
DivNRanksForAllReduce<phi::CPUContext>(tensor, nranks, context));
#else
framework::VisitDataType(dtype_,
DivNRanksForAllReduce<platform::CPUDeviceContext>(
tensor, nranks, context));
framework::VisitDataType(
dtype_,
DivNRanksForAllReduce<phi::CPUContext>(tensor, nranks, context));
#endif
VLOG(4) << "after div 2" << *tensor;
} else if (platform::is_xpu_place(tensor->place())) {
......@@ -328,11 +327,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with CNCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
ConcatTensorsWithType(
static_cast<const platform::CPUDeviceContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
ConcatTensorsWithType(static_cast<const phi::CPUContext &>(context),
dense_tensors_,
&dense_contents_,
dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Concat grad tensor not supported on place (%s)", place));
......@@ -390,11 +388,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
"Please recompile or reinstall Paddle with CNCL support."));
#endif
} else if (platform::is_cpu_place(place)) {
SplitTensorsWithType(
static_cast<const platform::CPUDeviceContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
SplitTensorsWithType(static_cast<const phi::CPUContext &>(context),
&dense_contents_,
&dense_tensors_,
dtype_);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Split grad tensor not supported on place (%s)", place));
......
......@@ -234,7 +234,7 @@ void LiteSubgraphPass::SetUpEngine(
framework::Scope* scope,
const std::vector<std::string>& params) {
std::ostringstream os;
platform::CPUDeviceContext ctx;
phi::CPUContext ctx;
for (const auto& param : params) {
VLOG(3) << "Serialize param: " << param;
PADDLE_ENFORCE_NOT_NULL(
......
......@@ -365,7 +365,7 @@ void ConvertToMixedPrecision(const std::string& model_file,
[](framework::Scope* scope,
const std::vector<std::string>& params) -> std::string {
std::ostringstream os;
platform::CPUDeviceContext ctx;
phi::CPUContext ctx;
for (const auto& param : params) {
VLOG(3) << "Serialize param: " << param;
PADDLE_ENFORCE_NOT_NULL(
......
......@@ -81,7 +81,7 @@ void make_fake_model(std::string* model, std::string* param) {
ctx.PartialInitWithAllocator();
#else
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
phi::CPUContext ctx(place);
#endif
// Prepare variables.
std::vector<std::string> repetitive_params{"x", "y"};
......
......@@ -62,7 +62,7 @@ void IOConverterTester(const platform::DeviceContext& ctx) {
TEST(EngineIOConverterTester, DefaultCPU) {
platform::CPUPlace place;
platform::CPUDeviceContext ctx(place);
phi::CPUContext ctx(place);
IOConverterTester(ctx);
}
......
......@@ -1469,20 +1469,16 @@ namespace plat = paddle::platform;
ops::ActivationOpGrad, \
ops::ActivationGradOpInplaceInferer);
#define REGISTER_ACTIVATION_CPU_KERNEL( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<float>>, \
ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>);
#define REGISTER_ACTIVATION_CPU_KERNEL( \
act_type, op_name, functor, grad_functor) \
REGISTER_OP_CPU_KERNEL( \
act_type, \
ops::ActivationKernel<phi::CPUContext, ops::functor<float>>, \
ops::ActivationKernel<phi::CPUContext, ops::functor<double>>); \
REGISTER_OP_CPU_KERNEL( \
act_type##_grad, \
ops::ActivationGradKernel<phi::CPUContext, ops::grad_functor<float>>, \
ops::ActivationGradKernel<phi::CPUContext, ops::grad_functor<double>>);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
......
......@@ -122,12 +122,11 @@ REGISTER_OPERATOR(
ops::AddPositionEncodingGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
REGISTER_OP_CPU_KERNEL(
add_position_encoding,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingKernel<plt::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(add_position_encoding,
ops::AddPositionEncodingKernel<phi::CPUContext, float>,
ops::AddPositionEncodingKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(
add_position_encoding_grad,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, float>,
ops::AddPositionEncodingGradKernel<plt::CPUDeviceContext, double>);
ops::AddPositionEncodingGradKernel<phi::CPUContext, float>,
ops::AddPositionEncodingGradKernel<phi::CPUContext, double>);
......@@ -342,7 +342,7 @@ DECLARE_INPLACE_OP_INFERER(AffineChannelGradInplaceInferer,
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(affine_channel,
ops::AffineChannelOp,
......
......@@ -28,7 +28,7 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> {
struct Linspace<phi::CPUContext, T> {
void operator()(T start,
T end,
int count,
......@@ -282,14 +282,12 @@ REGISTER_OPERATOR(affine_grid,
ops::AffineGridGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);
REGISTER_OP_CPU_KERNEL(
affine_grid,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
affine_grid_grad,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(affine_grid,
ops::AffineGridOpKernel<phi::CPUContext, float>,
ops::AffineGridOpKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(affine_grid_grad,
ops::AffineGridGradOpKernel<phi::CPUContext, float>,
ops::AffineGridGradOpKernel<phi::CPUContext, double>);
REGISTER_OP_VERSION(affine_grid)
.AddCheckpoint(
......
......@@ -84,7 +84,7 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
DECLARE_INFER_SHAPE_FUNCTOR(allclose,
AllcloseInferShapeFunctor,
......
......@@ -65,7 +65,7 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
alloc_float_status,
......
......@@ -95,7 +95,7 @@ template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
......@@ -106,11 +106,10 @@ class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
ctx.AllocateTmpTensor<bool, phi::CPUContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
auto& dev = *ctx.template device_context<phi::CPUContext>().eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
for (size_t i = 0; i < xs.size(); ++i) {
......
......@@ -68,7 +68,7 @@ class ClearFloatStatusKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
clear_float_status,
......
......@@ -67,7 +67,7 @@ class GetFloatStatusKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
get_float_status,
......
......@@ -169,9 +169,9 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite.
};
template <typename T, bool IsFoundInfOnCPU>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T, IsFoundInfOnCPU> {
class UpdateLossScalingFunctor<phi::CPUContext, T, IsFoundInfOnCPU> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
void operator()(const phi::CPUContext& ctx,
const bool* found_inf_data,
const T* pre_loss_scaling_data,
const int* good_in_data,
......@@ -203,9 +203,9 @@ class UpdateLossScalingFunctor<platform::CPUDeviceContext, T, IsFoundInfOnCPU> {
};
template <typename T>
class LazyZeros<platform::CPUDeviceContext, T> {
class LazyZeros<phi::CPUContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
void operator()(const phi::CPUContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
......@@ -225,7 +225,7 @@ class LazyZeros<platform::CPUDeviceContext, T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
update_loss_scaling,
......
......@@ -116,20 +116,16 @@ REGISTER_OPERATOR(angle,
REGISTER_OP_CPU_KERNEL(
angle,
ops::AngleKernel<paddle::platform::CPUDeviceContext, float>,
ops::AngleKernel<paddle::platform::CPUDeviceContext, double>,
ops::AngleKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::AngleKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
ops::AngleKernel<phi::CPUContext, float>,
ops::AngleKernel<phi::CPUContext, double>,
ops::AngleKernel<phi::CPUContext, paddle::platform::complex<float>>,
ops::AngleKernel<phi::CPUContext, paddle::platform::complex<double>>);
REGISTER_OPERATOR(angle_grad, ops::AngleGradOp);
REGISTER_OP_CPU_KERNEL(
angle_grad,
ops::AngleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::AngleGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::AngleGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::AngleGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
ops::AngleGradKernel<phi::CPUContext, float>,
ops::AngleGradKernel<phi::CPUContext, double>,
ops::AngleGradKernel<phi::CPUContext, paddle::platform::complex<float>>,
ops::AngleGradKernel<phi::CPUContext, paddle::platform::complex<double>>);
......@@ -51,7 +51,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor<void> {
void operator()(Place place) const {
auto &pool = platform::DeviceContextPool::Instance();
if (std::is_same<Place, platform::CPUPlace>::value) {
Apply(static_cast<platform::CPUDeviceContext *>(pool.Get(place)));
Apply(static_cast<phi::CPUContext *>(pool.Get(place)));
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place)));
......
......@@ -22,7 +22,7 @@ limitations under the License. */
TEST(AssignOp, AssignLoDTensor) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
phi::CPUContext ctx(cpu_place);
paddle::framework::Variable output;
paddle::operators::AssignFunctor assign_functor(&output, ctx);
......@@ -47,7 +47,7 @@ TEST(AssignOp, AssignLoDTensor) {
TEST(AssignOp, AssignLoDTensorArray) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
phi::CPUContext ctx(cpu_place);
paddle::framework::Variable output;
paddle::operators::AssignFunctor assign_functor(&output, ctx);
......@@ -78,7 +78,7 @@ TEST(AssignOp, AssignLoDTensorArray) {
TEST(AssignOp, AssignSelectedRows) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
phi::CPUContext ctx(cpu_place);
paddle::framework::Variable output;
paddle::operators::AssignFunctor assign_functor(&output, ctx);
......
......@@ -337,7 +337,7 @@ template <typename T>
class AttentionLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
using DeviceContext = phi::CPUContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
......@@ -416,10 +416,10 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
auto blas = phi::funcs::GetBlas<phi::CPUContext, T>(ctx);
// x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx,
total_T,
......
......@@ -18,11 +18,10 @@ namespace paddle {
namespace operators {
template <>
void GetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx,
int64_t* num_updates,
int64_t* num_accumulates,
int64_t* old_num_accumulates) {
void GetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
int64_t* num_updates,
int64_t* num_accumulates,
int64_t* old_num_accumulates) {
auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
......@@ -33,11 +32,10 @@ void GetAccumulators<paddle::platform::CPUDeviceContext>(
}
template <>
void SetAccumulators<paddle::platform::CPUDeviceContext>(
const framework::ExecutionContext& ctx,
int64_t num_updates,
int64_t num_accumulates,
int64_t old_num_accumulates) {
void SetAccumulators<phi::CPUContext>(const framework::ExecutionContext& ctx,
int64_t num_updates,
int64_t num_accumulates,
int64_t old_num_accumulates) {
auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
......@@ -217,7 +215,6 @@ REGISTER_OPERATOR(
ops::AverageAccumulatesOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
average_accumulates,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(average_accumulates,
ops::AverageAccumulatesKernel<phi::CPUContext, float>,
ops::AverageAccumulatesKernel<phi::CPUContext, double>);
......@@ -166,7 +166,6 @@ REGISTER_OPERATOR(batch_fc_grad,
ops::BatchFCGradOp,
ops::BatchFCGradOpNoNeedBufferVarsInferer);
REGISTER_OP_CPU_KERNEL(
batch_fc,
ops::BatchFCKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchFCKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(batch_fc,
ops::BatchFCKernel<phi::CPUContext, float>,
ops::BatchFCKernel<phi::CPUContext, double>);
......@@ -141,7 +141,7 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
new paddle::platform::CPUPlace());
paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
phi::CPUContext cpu_ctx(*cpu_place);
framework::LoD lod;
lod.push_back(source_level_lod);
......
......@@ -143,9 +143,8 @@ REGISTER_OPERATOR(beam_search,
ops::BeamSearchOp,
ops::BeamSearchOpMaker,
ops::BeamSearchInferVarType);
REGISTER_OP_CPU_KERNEL(
beam_search,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(beam_search,
ops::BeamSearchOpKernel<phi::CPUContext, float>,
ops::BeamSearchOpKernel<phi::CPUContext, double>,
ops::BeamSearchOpKernel<phi::CPUContext, int>,
ops::BeamSearchOpKernel<phi::CPUContext, int64_t>);
......@@ -172,11 +172,9 @@ REGISTER_OPERATOR(bmm,
ops::BmmOpGradMaker<paddle::framework::OpDesc>,
ops::BmmOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(bmm_grad, ops::BmmOpGrad);
REGISTER_OP_CPU_KERNEL(
bmm,
ops::BmmKernel<paddle::platform::CPUDeviceContext, float>,
ops::BmmKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
bmm_grad,
ops::BmmGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BmmGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(bmm,
ops::BmmKernel<phi::CPUContext, float>,
ops::BmmKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(bmm_grad,
ops::BmmGradKernel<phi::CPUContext, float>,
ops::BmmGradKernel<phi::CPUContext, double>);
......@@ -176,7 +176,7 @@ class BprLossGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
using CPUCtx = phi::CPUContext;
REGISTER_OPERATOR(bpr_loss,
ops::BprLossOp,
......
......@@ -141,7 +141,7 @@ class CastOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
// cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here.
REGISTER_OPERATOR(cast,
......
......@@ -146,7 +146,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(CenterLossGradNoNeedBufVarsInferer, "X");
} // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
using CPUCtx = phi::CPUContext;
REGISTER_OPERATOR(center_loss,
ops::CenterLossOp,
......
......@@ -113,13 +113,11 @@ It accomplishes the execution of the instruction according to the following step
} // namespace paddle::operators
namespace ops = paddle::operators;
using CPUDeviceContext = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
cinn_instruction_run,
ops::CinnInstructionRunOp,
ops::CinnInstructionRunOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
cinn_instruction_run,
ops::CinnInstructionRunOpKernel<CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(cinn_instruction_run,
ops::CinnInstructionRunOpKernel<phi::CPUContext, float>);
......@@ -189,6 +189,5 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
/* see [Why use single type kernel] */
REGISTER_OP_CPU_KERNEL(
cinn_launch,
ops::CinnLaunchOpKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(cinn_launch,
ops::CinnLaunchOpKernel<phi::CPUContext, float>);
......@@ -19,6 +19,5 @@ REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm,
ops::ClipByNormOp,
ops::ClipByNormOpMaker);
REGISTER_OP_CPU_KERNEL(
clip_by_norm,
ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(clip_by_norm,
ops::ClipByNormKernel<phi::CPUContext, float>);
......@@ -511,11 +511,10 @@ REGISTER_OPERATOR(coalesce_tensor,
paddle::operators::CoalesceTensorOpMaker);
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
......@@ -550,20 +549,18 @@ REGISTER_OP_XPU_KERNEL(
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
#endif
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>);
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
......
......@@ -73,10 +73,9 @@ REGISTER_OP_WITHOUT_GRADIENT(allreduce,
ops::AllReduceOp,
ops::AllReduceOpMaker);
REGISTER_OP_CPU_KERNEL(
allreduce,
ops::AllReduceOpKernel<plat::CPUDeviceContext, float>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, double>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, int>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, int64_t>,
ops::AllReduceOpKernel<plat::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL(allreduce,
ops::AllReduceOpKernel<phi::CPUContext, float>,
ops::AllReduceOpKernel<phi::CPUContext, double>,
ops::AllReduceOpKernel<phi::CPUContext, int>,
ops::AllReduceOpKernel<phi::CPUContext, int64_t>,
ops::AllReduceOpKernel<phi::CPUContext, plat::float16>);
......@@ -143,12 +143,10 @@ REGISTER_OPERATOR(complex,
REGISTER_OPERATOR(complex_grad, ops::ComplexGradOp);
REGISTER_OP_CPU_KERNEL(
complex,
ops::ComplexKernel<paddle::platform::CPUDeviceContext, float>,
ops::ComplexKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
complex_grad,
ops::ComplexGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ComplexGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(complex,
ops::ComplexKernel<phi::CPUContext, float>,
ops::ComplexKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(complex_grad,
ops::ComplexGradKernel<phi::CPUContext, float>,
ops::ComplexGradKernel<phi::CPUContext, double>);
......@@ -161,12 +161,10 @@ REGISTER_OPERATOR(as_real,
ops::AsRealGradMaker<paddle::framework::OpDesc>,
ops::AsRealGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
as_complex,
ops::AsComplexKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsComplexKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
as_real,
ops::AsRealKernel<paddle::platform::CPUDeviceContext, float>,
ops::AsRealKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(as_complex,
ops::AsComplexKernel<phi::CPUContext, float>,
ops::AsComplexKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(as_real,
ops::AsRealKernel<phi::CPUContext, float>,
ops::AsRealKernel<phi::CPUContext, double>);
......@@ -249,8 +249,6 @@ REGISTER_OPERATOR(cos_sim,
ops::CosSimGradOpMaker<paddle::framework::OpDesc>,
ops::CosSimGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad);
REGISTER_OP_CPU_KERNEL(
cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
cos_sim_grad,
ops::CosSimGradKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(cos_sim, ops::CosSimKernel<phi::CPUContext, float>);
REGISTER_OP_CPU_KERNEL(cos_sim_grad,
ops::CosSimGradKernel<phi::CPUContext, float>);
......@@ -215,7 +215,6 @@ namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(crf_decoding,
ops::CRFDecodingOp,
ops::CRFDecodingOpMaker);
REGISTER_OP_CPU_KERNEL(
crf_decoding,
ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(crf_decoding,
ops::CRFDecodingOpKernel<phi::CPUContext, float>,
ops::CRFDecodingOpKernel<phi::CPUContext, double>);
......@@ -223,14 +223,12 @@ REGISTER_OPERATOR(crop,
ops::CropGradOpMaker<paddle::imperative::OpBase>,
ops::GropNoNeedBufferVarInferer);
REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
REGISTER_OP_CPU_KERNEL(
crop,
ops::CropKernel<paddle::platform::CPUDeviceContext, float>,
ops::CropKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
crop_grad,
ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::CropGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(crop,
ops::CropKernel<phi::CPUContext, float>,
ops::CropKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(crop_grad,
ops::CropGradKernel<phi::CPUContext, float>,
ops::CropGradKernel<phi::CPUContext, double>);
REGISTER_OP_CUDA_KERNEL(
crop,
......
......@@ -320,18 +320,16 @@ REGISTER_OPERATOR(crop_tensor,
ops::CropTensorGradOpMaker<paddle::framework::OpDesc>,
ops::CropTensorGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad);
REGISTER_OP_CPU_KERNEL(
crop_tensor,
ops::CropTensorKernel<paddle::platform::CPUDeviceContext, float>,
ops::CropTensorKernel<paddle::platform::CPUDeviceContext, double>,
ops::CropTensorKernel<paddle::platform::CPUDeviceContext, int>,
ops::CropTensorKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
crop_tensor_grad,
ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(crop_tensor,
ops::CropTensorKernel<phi::CPUContext, float>,
ops::CropTensorKernel<phi::CPUContext, double>,
ops::CropTensorKernel<phi::CPUContext, int>,
ops::CropTensorKernel<phi::CPUContext, int64_t>);
REGISTER_OP_CPU_KERNEL(crop_tensor_grad,
ops::CropTensorGradKernel<phi::CPUContext, float>,
ops::CropTensorGradKernel<phi::CPUContext, double>,
ops::CropTensorGradKernel<phi::CPUContext, int>,
ops::CropTensorGradKernel<phi::CPUContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
crop_tensor,
......
......@@ -421,7 +421,7 @@ class CrossEntropyGradOpMaker2 : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPUCtx = paddle::platform::CPUDeviceContext;
using CPUCtx = phi::CPUContext;
REGISTER_OPERATOR(cross_entropy,
ops::CrossEntropyOpBase,
......
......@@ -129,7 +129,6 @@ REGISTER_OPERATOR(
ops::CTCAlignOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
ctc_align,
ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(ctc_align,
ops::CTCAlignKernel<phi::CPUContext, int>,
ops::CTCAlignKernel<phi::CPUContext, int64_t>);
......@@ -145,7 +145,7 @@ class LogcumsumexpGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
DECLARE_INFER_SHAPE_FUNCTOR(cumsum,
CumsumInferShapeFunctor,
PD_INFER_META(phi::CumInferMeta));
......
......@@ -287,8 +287,7 @@ The required data format for this layer is one of the following:
};
template <typename T>
class DataNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
class DataNormKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
// const bool is_test = ctx.Attr<bool>("is_test");
......@@ -533,8 +532,7 @@ class DataNormGradOp : public framework::OperatorWithKernel {
};
template <typename T>
class DataNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
class DataNormGradKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
......@@ -788,14 +786,12 @@ REGISTER_OPERATOR(data_norm,
ops::DataNormGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp);
REGISTER_OP_CPU_KERNEL(
data_norm,
ops::DataNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
data_norm_grad,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::DataNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(data_norm,
ops::DataNormKernel<phi::CPUContext, float>,
ops::DataNormKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(data_norm_grad,
ops::DataNormGradKernel<phi::CPUContext, float>,
ops::DataNormGradKernel<phi::CPUContext, double>);
REGISTER_OP_VERSION(data_norm).AddCheckpoint(
R"ROC(
upgrad data_norm op by adding scale_w to support scale and shift.)ROC",
......
......@@ -349,7 +349,7 @@ class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
deformable_psroi_pooling,
ops::DeformablePSROIPoolOp,
......
......@@ -33,8 +33,8 @@ namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
struct DequantizeFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& dev_ctx,
const framework::Tensor* in,
const framework::Tensor* scale,
float max_range,
......@@ -49,8 +49,8 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, int8_t>;
template struct DequantizeFunctor<platform::CPUDeviceContext, int16_t>;
template struct DequantizeFunctor<phi::CPUContext, int8_t>;
template struct DequantizeFunctor<phi::CPUContext, int16_t>;
class DequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
......@@ -102,7 +102,7 @@ $$Out = \frac{scale*X}{ max\_range }$$
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
dequantize_abs_max,
......
......@@ -32,8 +32,8 @@ namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
struct DequantizeFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& dev_ctx,
const framework::Tensor* in,
const framework::Tensor* dict,
framework::Tensor* out) {
......@@ -51,7 +51,7 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, int8_t>;
template struct DequantizeFunctor<phi::CPUContext, int8_t>;
class DequantizeLogOp : public framework::OperatorWithKernel {
public:
......@@ -108,7 +108,7 @@ This calculation is an opposite operation of QuantizeLogOp:
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
dequantize_log,
......
......@@ -200,7 +200,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.device_context<phi::CPUContext>();
auto col = dist_mat->dims()[1];
......@@ -216,9 +216,9 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
match_indices->mutable_data<int>({n, col}, context.GetPlace());
match_dist->mutable_data<T>({n, col}, context.GetPlace());
phi::funcs::SetConstant<platform::CPUDeviceContext, int> iset;
phi::funcs::SetConstant<phi::CPUContext, int> iset;
iset(dev_ctx, match_indices, static_cast<int>(-1));
phi::funcs::SetConstant<platform::CPUDeviceContext, T> tset;
phi::funcs::SetConstant<phi::CPUContext, T> tset;
tset(dev_ctx, match_dist, static_cast<T>(0));
int* indices = match_indices->data<int>();
......
......@@ -104,7 +104,6 @@ REGISTER_OPERATOR(
ops::BoxClipOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
box_clip,
ops::BoxClipKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxClipKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(box_clip,
ops::BoxClipKernel<phi::CPUContext, float>,
ops::BoxClipKernel<phi::CPUContext, double>);
......@@ -29,8 +29,7 @@ class BoxClipKernel : public framework::OpKernel<T> {
auto* input_box = context.Input<LoDTensor>("Input");
auto* im_info = context.Input<LoDTensor>("ImInfo");
auto* output_box = context.Output<LoDTensor>("Output");
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.template device_context<phi::CPUContext>();
output_box->mutable_data<T>(context.GetPlace());
if (input_box->lod().size()) {
PADDLE_ENFORCE_EQ(input_box->lod().size(),
......
......@@ -251,7 +251,6 @@ REGISTER_OPERATOR(
ops::BoxCoderOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
box_coder,
ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(box_coder,
ops::BoxCoderKernel<phi::CPUContext, float>,
ops::BoxCoderKernel<phi::CPUContext, double>);
......@@ -227,7 +227,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
box_decoder_and_assign,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(box_decoder_and_assign,
ops::BoxDecoderAndAssignKernel<phi::CPUContext, float>,
ops::BoxDecoderAndAssignKernel<phi::CPUContext, double>);
......@@ -122,7 +122,7 @@ class GenerateMaskLabelsOp : public framework::OperatorWithKernel {
* to encode class specific mask targets.
*/
template <typename T>
static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx,
static inline void ExpandMaskTarget(const phi::CPUContext& ctx,
const Tensor& masks,
const Tensor& mask_class_labels,
const int resolution,
......@@ -150,7 +150,7 @@ static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx,
}
template <typename T>
std::vector<Tensor> SampleMaskForOneImage(const platform::CPUDeviceContext& ctx,
std::vector<Tensor> SampleMaskForOneImage(const phi::CPUContext& ctx,
const Tensor& im_info,
const Tensor& gt_classes,
const Tensor& is_crowd,
......@@ -391,7 +391,7 @@ class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
std::vector<size_t> lod0(1, 0);
int64_t num_mask = 0;
auto& dev_ctx = ctx.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.device_context<phi::CPUContext>();
auto gt_classes_lod = gt_classes->lod().back();
auto is_crowd_lod = is_crowd->lod().back();
......
......@@ -168,7 +168,7 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
};
template <typename T>
void Concat(const platform::CPUDeviceContext& context,
void Concat(const phi::CPUContext& context,
const Tensor& in_tensor_a,
const Tensor& in_tensor_b,
Tensor* out_tensor) {
......@@ -176,24 +176,23 @@ void Concat(const platform::CPUDeviceContext& context,
std::vector<Tensor> inputs;
inputs.emplace_back(in_tensor_a);
inputs.emplace_back(in_tensor_b);
math::ConcatFunctor<platform::CPUDeviceContext, T> concat_functor;
math::ConcatFunctor<phi::CPUContext, T> concat_functor;
concat_functor(context, inputs, axis, out_tensor);
}
template <typename T>
std::vector<std::vector<int>> SampleFgBgGt(
const platform::CPUDeviceContext& context,
Tensor* iou,
const Tensor& is_crowd,
const int batch_size_per_im,
const float fg_fraction,
const float fg_thresh,
const float bg_thresh_hi,
const float bg_thresh_lo,
std::minstd_rand engine,
const bool use_random,
const bool is_cascade_rcnn,
const Tensor& rpn_rois) {
std::vector<std::vector<int>> SampleFgBgGt(const phi::CPUContext& context,
Tensor* iou,
const Tensor& is_crowd,
const int batch_size_per_im,
const float fg_fraction,
const float fg_thresh,
const float bg_thresh_hi,
const float bg_thresh_lo,
std::minstd_rand engine,
const bool use_random,
const bool is_cascade_rcnn,
const Tensor& rpn_rois) {
std::vector<int> fg_inds;
std::vector<int> bg_inds;
std::vector<int> mapped_gt_inds;
......@@ -286,7 +285,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
}
template <typename T>
void GatherBoxesLabels(const platform::CPUDeviceContext& context,
void GatherBoxesLabels(const phi::CPUContext& context,
const Tensor& boxes,
const Tensor& max_overlap,
const Tensor& gt_boxes,
......@@ -335,7 +334,7 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
template <typename T>
std::vector<Tensor> SampleRoisForOneImage(
const platform::CPUDeviceContext& context,
const phi::CPUContext& context,
const Tensor& rpn_rois_in,
const Tensor& gt_classes,
const Tensor& is_crowd,
......@@ -372,7 +371,7 @@ std::vector<Tensor> SampleRoisForOneImage(
Tensor roi_filter;
// Tensor box_filter;
if (keep.numel() == 0) {
phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_zero;
phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
roi_filter.mutable_data<T>({proposals_num, kBoxDim}, context.GetPlace());
set_zero(context, &roi_filter, static_cast<T>(0));
} else {
......@@ -597,7 +596,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
std::vector<size_t> lod0(1, 0);
int64_t num_rois = 0;
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.device_context<phi::CPUContext>();
auto rpn_rois_lod = rpn_rois->lod().back();
auto gt_classes_lod = gt_classes->lod().back();
......
......@@ -98,8 +98,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
float min_size = context.Attr<float>("min_size");
float eta = context.Attr<float>("eta");
auto &dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto &dev_ctx = context.template device_context<phi::CPUContext>();
auto &scores_dim = scores->dims();
int64_t num = scores_dim[0];
......@@ -122,7 +121,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
phi::funcs::Transpose<platform::CPUDeviceContext, T, 4> trans;
phi::funcs::Transpose<phi::CPUContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
......@@ -181,7 +180,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
}
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CPUDeviceContext &ctx,
const phi::CPUContext &ctx,
const Tensor &im_info_slice,
const Tensor &anchors,
const Tensor &variances,
......@@ -234,7 +233,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, true, &keep);
// Handle the case when there is no keep index left
if (keep.numel() == 0) {
phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_zero;
phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
set_zero(ctx, &bbox_sel, static_cast<T>(0));
Tensor scores_filter;
......
......@@ -99,8 +99,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
float eta = context.Attr<float>("eta");
bool pixel_offset = context.Attr<bool>("pixel_offset");
auto &dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
auto &dev_ctx = context.template device_context<phi::CPUContext>();
auto &scores_dim = scores->dims();
int64_t num = scores_dim[0];
......@@ -123,7 +122,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
scores_swap.mutable_data<T>({num, h_score, w_score, c_score},
dev_ctx.GetPlace());
phi::funcs::Transpose<platform::CPUDeviceContext, T, 4> trans;
phi::funcs::Transpose<phi::CPUContext, T, 4> trans;
std::vector<int> axis = {0, 2, 3, 1};
trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis);
trans(dev_ctx, *scores, &scores_swap, axis);
......@@ -183,7 +182,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
}
std::pair<Tensor, Tensor> ProposalForOneImage(
const platform::CPUDeviceContext &ctx,
const phi::CPUContext &ctx,
const Tensor &im_shape_slice,
const Tensor &anchors,
const Tensor &variances,
......@@ -240,7 +239,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel<T> {
ctx, &proposals, min_size, im_shape_slice, false, &keep, pixel_offset);
// Handle the case when there is no keep index left
if (keep.numel() == 0) {
phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_zero;
phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
set_zero(ctx, &bbox_sel, static_cast<T>(0));
Tensor scores_filter;
......
......@@ -113,7 +113,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
iou_similarity,
ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(iou_similarity,
ops::IOUSimilarityKernel<phi::CPUContext, float>,
ops::IOUSimilarityKernel<phi::CPUContext, double>);
......@@ -356,7 +356,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel<T> {
auto* outs = ctx.Output<LoDTensor>("Out");
auto& score_dims = scores_input->dims();
auto score_size = score_dims.size();
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
LoDTensor scores;
LoDTensor boxes;
......
......@@ -403,7 +403,6 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
mine_hard_examples,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(mine_hard_examples,
ops::MineHardExamplesKernel<phi::CPUContext, float>,
ops::MineHardExamplesKernel<phi::CPUContext, double>);
......@@ -219,7 +219,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
int num_det = 0;
......@@ -361,7 +361,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
auto rois_num = ctx.Input<Tensor>("RoisNum");
auto score_dims = scores->dims();
auto score_size = score_dims.size();
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
std::vector<std::map<int, std::vector<int>>> all_indices;
std::vector<size_t> batch_starts = {0};
......
......@@ -507,7 +507,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel<T> {
int64_t box_dim = box_dims[2];
int64_t out_dim = box_dim + 2;
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
std::vector<std::vector<std::vector<T>>> all_nmsed_out;
std::vector<size_t> batch_starts = {0};
......
......@@ -112,12 +112,11 @@ void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
}
template <typename T>
std::vector<Tensor> FilterStraddleAnchor(
const platform::CPUDeviceContext& context,
const Tensor* anchor,
const float rpn_straddle_thresh,
T im_height,
T im_width) {
std::vector<Tensor> FilterStraddleAnchor(const phi::CPUContext& context,
const Tensor* anchor,
const float rpn_straddle_thresh,
T im_height,
T im_width) {
std::vector<int> inds_inside;
int anchor_num = anchor->dims()[0];
auto* anchor_data = anchor->data<T>();
......@@ -154,7 +153,7 @@ std::vector<Tensor> FilterStraddleAnchor(
}
template <typename T>
Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
Tensor FilterCrowdGt(const phi::CPUContext& context,
Tensor* gt_boxes,
Tensor* is_crowd) {
int gt_num = gt_boxes->dims()[0];
......@@ -300,7 +299,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data,
}
template <typename T>
std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
std::vector<Tensor> SampleRpnFgBgGt(const phi::CPUContext& ctx,
const Tensor& anchor_by_gt_overlap,
const int rpn_batch_size_per_im,
const float rpn_positive_overlap,
......@@ -437,7 +436,7 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
tgt_bbox->mutable_data<T>({max_num, 4}, place);
tgt_lbl->mutable_data<int>({max_num, 1}, place);
bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.device_context<phi::CPUContext>();
std::random_device rnd;
std::minstd_rand engine;
......@@ -857,11 +856,10 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel {
};
template <typename T>
std::vector<Tensor> FilterCrowdGtBoxLabel(
const platform::CPUDeviceContext& context,
Tensor* gt_boxes,
Tensor* gt_labels,
Tensor* is_crowd) {
std::vector<Tensor> FilterCrowdGtBoxLabel(const phi::CPUContext& context,
Tensor* gt_boxes,
Tensor* gt_labels,
Tensor* is_crowd) {
int gt_num = gt_boxes->dims()[0];
std::vector<int> not_crowd_inds;
auto* is_crowd_data = is_crowd->data<int>();
......@@ -893,7 +891,7 @@ std::vector<Tensor> FilterCrowdGtBoxLabel(
}
template <typename T>
std::vector<Tensor> GetAllFgBgGt(const platform::CPUDeviceContext& ctx,
std::vector<Tensor> GetAllFgBgGt(const phi::CPUContext& ctx,
const Tensor& anchor_by_gt_overlap,
const Tensor& ncrowd_gt_labels,
const float positive_overlap,
......@@ -1044,7 +1042,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel<T> {
tgt_lbl->mutable_data<int>({max_num, 1}, place);
bbox_inside_weight->mutable_data<T>({max_num, 4}, place);
fg_num->mutable_data<int>({batch_num, 1}, place);
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.device_context<phi::CPUContext>();
std::random_device rnd;
std::minstd_rand engine;
......
......@@ -266,12 +266,10 @@ REGISTER_OPERATOR(sigmoid_focal_loss,
ops::SigmoidFocalLossGradOpMaker<paddle::framework::OpDesc>,
ops::SigmoidFocalLossGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp);
REGISTER_OP_CPU_KERNEL(
sigmoid_focal_loss,
ops::SigmoidFocalLossKernel<paddle::platform::CPUDeviceContext, float>,
ops::SigmoidFocalLossKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(sigmoid_focal_loss,
ops::SigmoidFocalLossKernel<phi::CPUContext, float>,
ops::SigmoidFocalLossKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(
sigmoid_focal_loss_grad,
ops::SigmoidFocalLossGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SigmoidFocalLossGradKernel<paddle::platform::CPUDeviceContext,
double>);
ops::SigmoidFocalLossGradKernel<phi::CPUContext, float>,
ops::SigmoidFocalLossGradKernel<phi::CPUContext, double>);
......@@ -149,8 +149,8 @@ for i-th instance and each `id` of NegIndices in this instance:
};
template <typename T, typename WT>
struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
void operator()(const platform::CPUDeviceContext& ctx,
struct NegTargetAssignFunctor<phi::CPUContext, T, WT> {
void operator()(const phi::CPUContext& ctx,
const int* neg_indices,
const size_t* lod,
const int N,
......@@ -172,10 +172,8 @@ struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
}
};
template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
template struct NegTargetAssignFunctor<platform::CPUDeviceContext,
float,
float>;
template struct NegTargetAssignFunctor<phi::CPUContext, int, float>;
template struct NegTargetAssignFunctor<phi::CPUContext, float, float>;
} // namespace operators
} // namespace paddle
......@@ -187,7 +185,6 @@ REGISTER_OPERATOR(
ops::TargetAssignOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
target_assign,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
REGISTER_OP_CPU_KERNEL(target_assign,
ops::TargetAssignKernel<phi::CPUContext, int, float>,
ops::TargetAssignKernel<phi::CPUContext, float, float>);
......@@ -179,12 +179,10 @@ REGISTER_OPERATOR(slogdeterminant,
REGISTER_OPERATOR(slogdeterminant_grad,
ops::SlogDeterminantGradOp) // reuse det grad op
REGISTER_OP_CPU_KERNEL(
slogdeterminant,
ops::SlogDeterminantKernel<plat::CPUDeviceContext, float>,
ops::SlogDeterminantKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
slogdeterminant_grad,
ops::SlogDeterminantGradKernel<plat::CPUDeviceContext, float>,
ops::SlogDeterminantGradKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(slogdeterminant,
ops::SlogDeterminantKernel<phi::CPUContext, float>,
ops::SlogDeterminantKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(slogdeterminant_grad,
ops::SlogDeterminantGradKernel<phi::CPUContext, float>,
ops::SlogDeterminantGradKernel<phi::CPUContext, double>);
......@@ -66,6 +66,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm,
ops::DGCClipByNormOp,
ops::DGCClipByNormOpMaker);
REGISTER_OP_CPU_KERNEL(
dgc_clip_by_norm,
ops::DGCClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(dgc_clip_by_norm,
ops::DGCClipByNormKernel<phi::CPUContext, float>);
......@@ -138,9 +138,8 @@ REGISTER_OPERATOR(
ops::DiagEmbedOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
diag_embed,
ops::DiagEmbedKernel<paddle::platform::CPUDeviceContext, int>,
ops::DiagEmbedKernel<paddle::platform::CPUDeviceContext, float>,
ops::DiagEmbedKernel<paddle::platform::CPUDeviceContext, double>,
ops::DiagEmbedKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(diag_embed,
ops::DiagEmbedKernel<phi::CPUContext, int>,
ops::DiagEmbedKernel<phi::CPUContext, float>,
ops::DiagEmbedKernel<phi::CPUContext, double>,
ops::DiagEmbedKernel<phi::CPUContext, int64_t>);
......@@ -59,9 +59,8 @@ REGISTER_OPERATOR(
ops::DiagOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
diag,
ops::DiagKernel<paddle::platform::CPUDeviceContext, int>,
ops::DiagKernel<paddle::platform::CPUDeviceContext, float>,
ops::DiagKernel<paddle::platform::CPUDeviceContext, double>,
ops::DiagKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(diag,
ops::DiagKernel<phi::CPUContext, int>,
ops::DiagKernel<phi::CPUContext, float>,
ops::DiagKernel<phi::CPUContext, double>,
ops::DiagKernel<phi::CPUContext, int64_t>);
......@@ -42,11 +42,11 @@ struct GammaCPUFunctor {
};
template <typename T>
struct DirichletSampler<platform::CPUDeviceContext, T> {
struct DirichletSampler<phi::CPUContext, T> {
void operator()(const framework::ExecutionContext& ctx,
const Tensor* alpha,
Tensor* out) {
auto& dev_ctx = ctx.device_context<platform::CPUDeviceContext>();
auto& dev_ctx = ctx.device_context<phi::CPUContext>();
auto p_gen = framework::DefaultCPUGenerator();
auto generator = p_gen->GetCPUEngine();
......@@ -71,8 +71,7 @@ struct DirichletSampler<platform::CPUDeviceContext, T> {
gamma_samples.data<T>(),
standard_uniform,
standard_normal);
platform::ForRange<platform::CPUDeviceContext> for_range(dev_ctx,
alpha->numel());
platform::ForRange<phi::CPUContext> for_range(dev_ctx, alpha->numel());
for_range(gamma_functor);
// normalize them into a simplex, along the last axis
......@@ -81,10 +80,10 @@ struct DirichletSampler<platform::CPUDeviceContext, T> {
new_shape[new_shape.size() - 1] = 1;
gamma_sum.mutable_data<T>(new_shape, dev_ctx.GetPlace());
ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
ReduceKernelFunctor<phi::CPUContext, T, SumFunctor>(
&gamma_samples, &gamma_sum, {new_shape.size() - 1}, true, false, ctx)
.template apply<T>();
ElementwiseComputeEx<DivFunctor<T>, platform::CPUDeviceContext, T, T>(
ElementwiseComputeEx<DivFunctor<T>, phi::CPUContext, T, T>(
ctx, &gamma_samples, &gamma_sum, -1, DivFunctor<T>(), out);
}
};
......@@ -125,7 +124,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dirichlet,
paddle::operators::DirichletOpMaker);
REGISTER_OP_CPU_KERNEL(
dirichlet,
paddle::operators::DirichletKernel<paddle::platform::CPUDeviceContext,
float>,
paddle::operators::DirichletKernel<paddle::platform::CPUDeviceContext,
double>);
paddle::operators::DirichletKernel<phi::CPUContext, float>,
paddle::operators::DirichletKernel<phi::CPUContext, double>);
......@@ -91,7 +91,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
phi::CPUContext ctx(place);
Compare(scope, ctx);
}
......
......@@ -164,19 +164,15 @@ REGISTER_OPERATOR(eig,
REGISTER_OPERATOR(eig_grad, ops::EigGradOp);
REGISTER_OP_CPU_KERNEL(
eig,
ops::EigKernel<paddle::platform::CPUDeviceContext, float, complex64>,
ops::EigKernel<paddle::platform::CPUDeviceContext, double, complex128>,
ops::EigKernel<paddle::platform::CPUDeviceContext, complex64, complex64>,
ops::EigKernel<paddle::platform::CPUDeviceContext, complex128, complex128>);
REGISTER_OP_CPU_KERNEL(eig,
ops::EigKernel<phi::CPUContext, float, complex64>,
ops::EigKernel<phi::CPUContext, double, complex128>,
ops::EigKernel<phi::CPUContext, complex64, complex64>,
ops::EigKernel<phi::CPUContext, complex128, complex128>);
REGISTER_OP_CPU_KERNEL(
eig_grad,
ops::EigGradKernel<paddle::platform::CPUDeviceContext, float, complex64>,
ops::EigGradKernel<paddle::platform::CPUDeviceContext, double, complex128>,
ops::
EigGradKernel<paddle::platform::CPUDeviceContext, complex64, complex64>,
ops::EigGradKernel<paddle::platform::CPUDeviceContext,
complex128,
complex128>);
ops::EigGradKernel<phi::CPUContext, float, complex64>,
ops::EigGradKernel<phi::CPUContext, double, complex128>,
ops::EigGradKernel<phi::CPUContext, complex64, complex64>,
ops::EigGradKernel<phi::CPUContext, complex128, complex128>);
......@@ -70,7 +70,7 @@ void TransposeTwoAxis(const Tensor& input,
permute[axis2] = axis1;
transposed_input->mutable_data<T>(input.dims(), context.GetPlace());
auto& dev_ctx = context.template device_context<platform::CPUDeviceContext>();
auto& dev_ctx = context.template device_context<phi::CPUContext>();
TransCompute<DeviceContext, T>(
input.dims().size(), dev_ctx, input, transposed_input, permute);
......
......@@ -86,10 +86,9 @@ REGISTER_OPERATOR(eigvals,
ops::EigvalsOp,
ops::EigvalsOpMaker,
ops::EigvalsOpVarTypeInference);
REGISTER_OP_CPU_KERNEL(eigvals,
ops::EigvalsKernel<plat::CPUDeviceContext, float>,
ops::EigvalsKernel<plat::CPUDeviceContext, double>,
ops::EigvalsKernel<plat::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::EigvalsKernel<plat::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
eigvals,
ops::EigvalsKernel<phi::CPUContext, float>,
ops::EigvalsKernel<phi::CPUContext, double>,
ops::EigvalsKernel<phi::CPUContext, paddle::platform::complex<float>>,
ops::EigvalsKernel<phi::CPUContext, paddle::platform::complex<double>>);
......@@ -151,24 +151,23 @@ REGISTER_OPERATOR(eigvalsh,
ops::EigvalshGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(eigvalsh_grad, ops::EigvalshGradOp);
REGISTER_OP_CPU_KERNEL(
eigvalsh,
ops::EigvalshKernel<paddle::platform::CPUDeviceContext, float, float>,
ops::EigvalshKernel<paddle::platform::CPUDeviceContext, double, double>,
ops::EigvalshKernel<paddle::platform::CPUDeviceContext,
float,
paddle::platform::complex<float>>,
ops::EigvalshKernel<paddle::platform::CPUDeviceContext,
double,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(eigvalsh,
ops::EigvalshKernel<phi::CPUContext, float, float>,
ops::EigvalshKernel<phi::CPUContext, double, double>,
ops::EigvalshKernel<phi::CPUContext,
float,
paddle::platform::complex<float>>,
ops::EigvalshKernel<phi::CPUContext,
double,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
eigvalsh_grad,
ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, float, float>,
ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext, double, double>,
ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext,
ops::EigvalshGradKernel<phi::CPUContext, float, float>,
ops::EigvalshGradKernel<phi::CPUContext, double, double>,
ops::EigvalshGradKernel<phi::CPUContext,
float,
paddle::platform::complex<float>>,
ops::EigvalshGradKernel<paddle::platform::CPUDeviceContext,
ops::EigvalshGradKernel<phi::CPUContext,
double,
paddle::platform::complex<double>>);
......@@ -146,19 +146,17 @@ REGISTER_OPERATOR(expand_as,
REGISTER_OPERATOR(expand_as_grad,
ops::ExpandAsGradOp,
ops::ExpandAsGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand_as,
ops::ExpandAsKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandAsKernel<paddle::platform::CPUDeviceContext, double>,
ops::ExpandAsKernel<paddle::platform::CPUDeviceContext, int>,
ops::ExpandAsKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ExpandAsKernel<paddle::platform::CPUDeviceContext, bool>);
REGISTER_OP_CPU_KERNEL(
expand_as_grad,
ops::ExpandAsGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ExpandAsGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ExpandAsGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandAsGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(expand_as,
ops::ExpandAsKernel<phi::CPUContext, float>,
ops::ExpandAsKernel<phi::CPUContext, double>,
ops::ExpandAsKernel<phi::CPUContext, int>,
ops::ExpandAsKernel<phi::CPUContext, int64_t>,
ops::ExpandAsKernel<phi::CPUContext, bool>);
REGISTER_OP_CPU_KERNEL(expand_as_grad,
ops::ExpandAsGradKernel<phi::CPUContext, int>,
ops::ExpandAsGradKernel<phi::CPUContext, int64_t>,
ops::ExpandAsGradKernel<phi::CPUContext, float>,
ops::ExpandAsGradKernel<phi::CPUContext, double>);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
expand_as,
......
......@@ -280,19 +280,17 @@ REGISTER_OPERATOR(expand_grad,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL(
expand,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, double>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, int>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::ExpandKernel<paddle::platform::CPUDeviceContext, bool>);
REGISTER_OP_CPU_KERNEL(
expand_grad,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(expand,
ops::ExpandKernel<phi::CPUContext, float>,
ops::ExpandKernel<phi::CPUContext, double>,
ops::ExpandKernel<phi::CPUContext, int>,
ops::ExpandKernel<phi::CPUContext, int64_t>,
ops::ExpandKernel<phi::CPUContext, bool>);
REGISTER_OP_CPU_KERNEL(expand_grad,
ops::ExpandGradKernel<phi::CPUContext, float>,
ops::ExpandGradKernel<phi::CPUContext, double>,
ops::ExpandGradKernel<phi::CPUContext, int>,
ops::ExpandGradKernel<phi::CPUContext, int64_t>);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL(
expand,
......
......@@ -62,8 +62,7 @@ class ExponentialOpInferVarType
};
template <typename T>
class ExponentialKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
class ExponentialKernel<phi::CPUContext, T> : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *out = ctx.Output<framework::Tensor>("Out");
......@@ -135,9 +134,8 @@ REGISTER_OPERATOR(exponential_grad,
ExponentialGradInferer);
REGISTER_OP_CPU_KERNEL(exponential,
ops::ExponentialKernel<plat::CPUDeviceContext, float>,
ops::ExponentialKernel<plat::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
exponential_grad,
ops::ExponentialGradKernel<plat::CPUDeviceContext, float>,
ops::ExponentialGradKernel<plat::CPUDeviceContext, double>);
ops::ExponentialKernel<phi::CPUContext, float>,
ops::ExponentialKernel<phi::CPUContext, double>);
REGISTER_OP_CPU_KERNEL(exponential_grad,
ops::ExponentialGradKernel<phi::CPUContext, float>,
ops::ExponentialGradKernel<phi::CPUContext, double>);
......@@ -23,8 +23,8 @@ namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
struct DequantizeFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& dev_ctx,
const framework::Tensor* in,
const framework::Tensor* scale,
T max_range,
......@@ -39,8 +39,8 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
};
template <typename T>
struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
struct ChannelDequantizeFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& dev_ctx,
const framework::Tensor* in,
const framework::Tensor** scales,
const int scale_num,
......@@ -139,10 +139,10 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
template struct DequantizeFunctor<phi::CPUContext, float>;
template struct DequantizeFunctor<phi::CPUContext, double>;
template struct ChannelDequantizeFunctor<phi::CPUContext, float>;
template struct ChannelDequantizeFunctor<phi::CPUContext, double>;
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
......@@ -269,7 +269,7 @@ Notes: In general, the per-channel quantization is only applied to weights and t
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
fake_dequantize_max_abs,
......
......@@ -32,8 +32,8 @@ struct Compare {
};
template <typename T>
struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct FindAbsMaxFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const T *in,
const int num,
T *out) {
......@@ -41,11 +41,11 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
}
};
template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
template struct FindAbsMaxFunctor<phi::CPUContext, float>;
template <typename T>
struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct FindChannelAbsMaxFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in_tensor,
const int quant_axis,
T *out_abs_max) {
......@@ -86,11 +86,11 @@ struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
}
};
template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
template struct FindChannelAbsMaxFunctor<phi::CPUContext, float>;
template <typename T>
struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct ClipAndFakeQuantFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
......@@ -98,7 +98,7 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
framework::Tensor *out) {
T s = scale.data<T>()[0];
T inv_s = inverse(s);
platform::Transform<platform::CPUDeviceContext> trans;
platform::Transform<phi::CPUContext> trans;
if (round_type == 0) {
trans(ctx,
in.data<T>(),
......@@ -117,11 +117,11 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
}
};
template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
template struct ClipAndFakeQuantFunctor<phi::CPUContext, float>;
template <typename T>
struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct ClipAndFakeQuantDequantFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
......@@ -130,7 +130,7 @@ struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
T s = scale.data<T>()[0];
T inv_s = inverse(s);
platform::Transform<platform::CPUDeviceContext> trans;
platform::Transform<phi::CPUContext> trans;
if (round_type == 0) {
trans(ctx,
in.data<T>(),
......@@ -151,12 +151,11 @@ struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
}
}
};
template struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
template struct ClipAndFakeQuantDequantFunctor<phi::CPUContext, float>;
template <typename T>
struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct ChannelClipAndFakeQuantFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
......@@ -176,7 +175,7 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
platform::Transform<phi::CPUContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int64_t i = 0; i < channel; i++) {
......@@ -235,11 +234,10 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
}
};
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>;
template struct ChannelClipAndFakeQuantFunctor<phi::CPUContext, float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct ChannelClipFakeQuantDequantFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in,
const framework::Tensor &scale,
const int bin_cnt,
......@@ -258,7 +256,7 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
platform::Transform<phi::CPUContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
......@@ -326,11 +324,10 @@ struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
template struct ChannelClipFakeQuantDequantFunctor<phi::CPUContext, float>;
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct FindRangeAbsMaxFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &cur_scale,
const framework::Tensor &last_scale,
const framework::Tensor &iter,
......@@ -349,18 +346,17 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
max = cur;
} else if (fabs(removed - max) < 1e-6) {
int size = (it > window_size) ? window_size : it;
FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(
ctx, scale_arr, size, &max);
FindAbsMaxFunctor<phi::CPUContext, T>()(ctx, scale_arr, size, &max);
}
out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
}
};
template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
template struct FindRangeAbsMaxFunctor<phi::CPUContext, float>;
template <typename T>
struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext &ctx,
struct FindMovingAverageAbsMaxFunctor<phi::CPUContext, T> {
void operator()(const phi::CPUContext &ctx,
const framework::Tensor &in_accum,
const framework::Tensor &in_state,
const T *cur_scale,
......@@ -382,8 +378,7 @@ struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
}
};
template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
float>;
template struct FindMovingAverageAbsMaxFunctor<phi::CPUContext, float>;
class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel {
public:
......@@ -968,7 +963,7 @@ class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
using CPU = phi::CPUContext;
REGISTER_OPERATOR(
fake_quantize_abs_max,
......
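Beyond the kernel registrations, the quantization files above also re-specialize their CPU functors and their explicit instantiations on the new context type. A minimal sketch of that side of the pattern, assuming the usual fluid framework headers; ExampleScaleFunctor is a hypothetical name, not a symbol touched by this PR:
template <typename DeviceContext, typename T>
struct ExampleScaleFunctor;
// CPU specialization now takes phi::CPUContext instead of
// platform::CPUDeviceContext.
template <typename T>
struct ExampleScaleFunctor<phi::CPUContext, T> {
  void operator()(const phi::CPUContext& dev_ctx,
                  const framework::Tensor& in,
                  T scale,
                  framework::Tensor* out) {
    auto in_e = framework::EigenVector<T>::Flatten(in);
    auto out_e = framework::EigenVector<T>::Flatten(*out);
    out_e.device(*dev_ctx.eigen_device()) = in_e * scale;
  }
};
// Explicit instantiations name phi::CPUContext as well.
template struct ExampleScaleFunctor<phi::CPUContext, float>;
template struct ExampleScaleFunctor<phi::CPUContext, double>;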
......@@ -223,7 +223,6 @@ REGISTER_OPERATOR(
ops::FCOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fc,
ops::FCOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FCOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(fc,
ops::FCOpKernel<phi::CPUContext, float>,
ops::FCOpKernel<phi::CPUContext, double>);
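For reference, every hunk shown above applies the same mechanical substitution: platform::CPUDeviceContext (including the plat:: and p:: aliases) becomes phi::CPUContext in device_context<>() calls, functor and kernel template arguments, and REGISTER_OP_CPU_KERNEL lists, with the registration calls re-wrapped accordingly. A minimal end-to-end sketch of a migrated CPU kernel, assuming the usual op_registry and math_function headers; ExampleKernel and example_op are hypothetical names used only to illustrate the pattern:
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // CPU kernels now request phi::CPUContext from the execution context.
    auto& dev_ctx = ctx.template device_context<phi::CPUContext>();
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
    phi::funcs::SetConstant<phi::CPUContext, T> set_zero;
    set_zero(dev_ctx, out, static_cast<T>(0));
  }
};
}  // namespace operators
}  // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CPU_KERNEL(example_op,
                       ops::ExampleKernel<phi::CPUContext, float>,
                       ops::ExampleKernel<phi::CPUContext, double>);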
(The diffs of the remaining 29 files are collapsed and not shown.)