Unverified commit 86763023, authored by Leo Chen, committed by GitHub

unify gpu context (#44740)

* remove cudaDeviceContext

* remove more template

* fix rocm compile

* remove alias name CUDADeviceContext

* fix compile

* fix tests

* revert changes
Parent: f15d930a
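The change below is mechanical but wide: every cast, template argument, and alias that named `paddle::platform::CUDADeviceContext` now names `phi::GPUContext`, while the pooled context object returned by `DeviceContextPool` stays the same. A minimal sketch of the before/after call-site pattern, assuming a GPU build of Paddle at this commit; the helper name and include paths are illustrative and not part of the commit:

// Editor's sketch (not part of the commit): the call-site pattern this diff rewrites everywhere below.
#include "paddle/fluid/platform/device_context.h"  // DeviceContextPool (assumed path)
#include "paddle/phi/backends/gpu/gpu_context.h"   // phi::GPUContext (assumed path)

// Before: auto* ctx = static_cast<paddle::platform::CUDADeviceContext*>(
//             paddle::platform::DeviceContextPool::Instance().Get(place));
// After: the phi type is named directly; the pooled object is unchanged.
phi::GPUContext* GetGpuContext(const paddle::platform::Place& place) {  // hypothetical helper
  return static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
}
// Typical use at the call sites below: gpuStream_t s = GetGpuContext(place)->stream();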
@@ -94,7 +94,7 @@ class NPUEventManager {
   PADDLE_ENFORCE_EQ(device_index,
                     device_index_,
                     platform::errors::PreconditionNotMet(
-                        "CUDADeviceContext's device %d does not match"
+                        "phi::GPUContext's device %d does not match"
                         "Event's device %d",
                         device_index,
                         device_index_));
...
@@ -104,7 +104,7 @@ class EventManager {
   bool DeviceId() const { return device_index_; }
   gpuEvent_t GetRawCudaEvent() const { return event_; }
-  void Record(const paddle::platform::CUDADeviceContext& ctx) {
+  void Record(const phi::GPUContext& ctx) {
     auto device_index = ctx.GetPlace().device;
     if (!is_created_) {
       CreateEvent(device_index);
@@ -112,7 +112,7 @@ class EventManager {
     PADDLE_ENFORCE_EQ(device_index,
                       device_index_,
                       platform::errors::PreconditionNotMet(
-                          "CUDADeviceContext's device %d does not match"
+                          "phi::GPUContext's device %d does not match"
                           "Event's device %d",
                           device_index,
                           device_index_));
@@ -157,13 +157,13 @@ class EventManager {
     }
   }
-  void Block(const paddle::platform::CUDADeviceContext& ctx) const {
+  void Block(const phi::GPUContext& ctx) const {
     if (is_created_) {
       auto device_index = ctx.GetPlace().device;
       PADDLE_ENFORCE_EQ(device_index,
                         device_index_,
                         platform::errors::PreconditionNotMet(
-                            "CUDADeviceContext's device %d does not match"
+                            "phi::GPUContext's device %d does not match"
                             "Event's device %d",
                             device_index,
                             device_index_));
...
@@ -31,10 +31,10 @@ namespace distributed {
 void SyncDefaultStream(
     const std::vector<Place>& places,
     std::vector<EventManager>& ncclEvents,                       // NOLINT
-    std::vector<std::unique_ptr<CUDADeviceContext>>& dev_ctx) {  // NOLINT
+    std::vector<std::unique_ptr<phi::GPUContext>>& dev_ctx) {    // NOLINT
   for (size_t i = 0; i < places.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places[i]));
     ncclEvents[i].Record(*default_ctx);
     ncclEvents[i].Block(*dev_ctx[i]);
@@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs(
 void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() {
   for (size_t i = 0; i < places_.size(); ++i) {
-    auto* default_ctx = static_cast<platform::CUDADeviceContext*>(
+    auto* default_ctx = static_cast<phi::GPUContext*>(
         platform::DeviceContextPool::Instance().Get(places_[i]));
     default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent());
   }
@@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
           << ", place: " << places_key
           << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id);
-  std::vector<std::unique_ptr<CUDADeviceContext>> dev_ctx;
+  std::vector<std::unique_ptr<phi::GPUContext>> dev_ctx;
   dev_ctx.resize(places.size());
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
@@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache(
   for (size_t i = 0; i < places.size(); ++i) {
     platform::CUDADeviceGuard guard(places[i]);
     nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id);
-    dev_ctx[i].reset(new CUDADeviceContext(places[i]));
+    dev_ctx[i].reset(new phi::GPUContext(places[i]));
   }
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
...
@@ -45,7 +45,6 @@ namespace paddle {
 namespace distributed {
 using Place = paddle::platform::Place;
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 class ProcessGroupNCCL : public ProcessGroup {
  public:
@@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup {
   std::unordered_map<std::string, std::vector<EventManager>> places_to_events_;
-  std::unordered_map<std::string,
-                     std::vector<std::unique_ptr<CUDADeviceContext>>>
+  std::unordered_map<std::string, std::vector<std::unique_ptr<phi::GPUContext>>>
       places_to_ctx_;
   std::set<int> used_place_ids_;
...
@@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context,
 void EagerGroup::ConcatTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     ConcatTensorsWithType(
         *default_ctx, dense_tensors_, &dense_contents_, dtype_);
@@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) {
 void EagerGroup::SplitTensors(const platform::Place &place) {
   if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    auto *default_ctx = static_cast<platform::CUDADeviceContext *>(
+    auto *default_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     SplitTensorsWithType(
         *default_ctx, &dense_contents_, &dense_tensors_, dtype_);
@@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
   auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_);
   if (platform::is_gpu_place(inner_place_)) {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    dev_ctx = static_cast<platform::CUDADeviceContext *>(
+    dev_ctx = static_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(inner_place_));
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
...
@@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data,
   VLOG(3) << "Loading data for GPU.";
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  auto *dev_ctx = dynamic_cast<const phi::GPUContext *>(pool.Get(place));
   auto gpu_place = place;
   memory::Copy(gpu_place,
                static_cast<void *>(input_tensor_ptr),
...
@@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var,
     char* temp_ptr =
         new char[tensor->numel() *
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(
         platform::CPUPlace(),
         temp_ptr,
@@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var,
                  framework::DataTypeSize(tensor->dtype())];  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);     // NOLINT
     io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
@@ -311,8 +308,7 @@ void DeserializeSelectedRows(
     unsigned long data_len;  // NOLINT
     io_buffer_itr.copy_and_forward((void*)(&data_len), 8);  // NOLINT
     io_buffer_itr.copy_and_forward(temp_ptr, data_len);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(place,
                  tensor_data,
                  platform::CPUPlace(),
...
@@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx,
     std::vector<char> temp;
     temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype()));
     char* temp_ptr = temp.data();
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
     memory::Copy(platform::CPUPlace(),
                  temp_ptr,
                  tensor->place(),
...
@@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (expected_kernel_place == paddle::platform::CUDAPlace()) {
-    auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-        pool.Get(expected_kernel_place));
+    auto* dev_ctx =
+        dynamic_cast<phi::GPUContext*>(pool.Get(expected_kernel_place));
     if (!dev_ctx) {
       PADDLE_THROW(paddle::platform::errors::Fatal(
           "Cannot convert device_context to CUDADeviceContext."
           "This indicates backend mismatch."
           "Pleas double check your expected place"));
     }
-    ScaleDeviceDispatch<paddle::platform::CUDADeviceContext>(
-        *dense_tensor.get(),
-        *dev_ctx,
-        scale,
-        bias,
-        bias_after_scale,
-        dense_out.get());
+    ScaleDeviceDispatch<phi::GPUContext>(*dense_tensor.get(),
+                                         *dev_ctx,
+                                         scale,
+                                         bias,
+                                         bias_after_scale,
+                                         dense_out.get());
 #endif
   } else {
     PADDLE_THROW(paddle::platform::errors::Fatal(
...
@@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) {
   auto& place = dense_tensor->place();
   if (paddle::platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    paddle::framework::details::tensor_check<
-        paddle::platform::CUDADeviceContext>(
+    paddle::framework::details::tensor_check<phi::GPUContext>(
         api_name, tensor_name, *dense_tensor, place);
 #else
     PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
...
@@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(place,
                          mutable_x,
@@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     auto* x_tensor = X->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) {
   for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     std::vector<float> x_src_data(MLP_M * MLP_N, MLP_X_VAL);
...
@@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr<imperative::VarBase>& X,
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue(
   if (place == paddle::platform::CUDAPlace()) {
     paddle::platform::DeviceContextPool& pool =
         paddle::platform::DeviceContextPool::Instance();
-    auto* dev_ctx =
-        dynamic_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+    auto* dev_ctx = dynamic_cast<phi::GPUContext*>(pool.Get(place));
     auto stream = dev_ctx->stream();
     paddle::memory::Copy(paddle::platform::CPUPlace(),
...
@@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(paddle::platform::CPUPlace(),
@@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target,
 #ifdef PADDLE_WITH_CUDA
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = dynamic_cast<paddle::platform::CUDADeviceContext*>(
-      pool.Get(paddle::platform::CUDAPlace()));
+  auto* dev_ctx =
+      dynamic_cast<phi::GPUContext*>(pool.Get(paddle::platform::CUDAPlace()));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(paddle::platform::CPUPlace(),
...
@@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT(
     paddle::framework::OpKernelTestProtoAndCheckerMaker);
 REGISTER_OP_CPU_KERNEL(test_op,
                        paddle::framework::TestKernel<phi::CPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    test_op,
-    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(test_op,
+                        paddle::framework::TestKernel<phi::GPUContext, float>);
 static void BuildVar(const std::string& param_name,
                      std::initializer_list<const char*> arguments,
...
@@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) {
 MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place,
                                    const std::vector<UsedSlotInfo>& infos) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
@@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {}
 void MiniBatchGpuPack::reset(const paddle::platform::Place& place) {
   place_ = place;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
   ins_num_ = 0;
...
@@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
           paddle::platform::DeviceContextPool::Instance().Get(this->place_))
           ->stream();
   FillSlotValueOffsetKernel<<<GET_BLOCKS(used_slot_num),
@@ -168,7 +168,7 @@ void SlotRecordInMemoryDataFeed::CopyForTensor(
     const int float_slot_size,
     const UsedSlotGpuType *used_slots) {
   auto stream =
-      dynamic_cast<platform::CUDADeviceContext *>(
+      dynamic_cast<phi::GPUContext *>(
          paddle::platform::DeviceContextPool::Instance().Get(this->place_))
          ->stream();
...
@@ -103,8 +103,8 @@ struct CastDataType {
                    CastDataTypeFunctor<InType, OutType>());
 #if defined(__NVCC__) || defined(__HIPCC__)
   } else if (platform::is_gpu_place(in_.place())) {
-    platform::Transform<platform::CUDADeviceContext> trans;
-    auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
+    platform::Transform<phi::GPUContext> trans;
+    auto* context = static_cast<const phi::GPUContext*>(ctx_);
     trans(*context,
           in_begin,
           in_end,
...
@@ -19,7 +19,7 @@ limitations under the License. */
 TEST(DataTypeTransform, GPUTransform) {
   auto cpu_place = paddle::platform::CPUPlace();
   auto gpu_place = paddle::platform::CUDAPlace(0);
-  paddle::platform::CUDADeviceContext context(gpu_place);
+  phi::GPUContext context(gpu_place);
   context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(gpu_place, context.stream())
                            .get());
...
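The DataTypeTransform test above shows the one extra step that standalone construction needs: a `phi::GPUContext` created outside the `DeviceContextPool` does not come with an allocator, so the test wires one in from `AllocatorFacade` before use. A hedged sketch of that setup, following the lines in the hunk; the include paths are assumptions, and real kernels may need more than this single allocator:

// Editor's sketch (not part of the commit), mirroring the test hunk above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/phi/backends/gpu/gpu_context.h"              // assumed path

void BuildStandaloneGpuContext() {
  auto gpu_place = paddle::platform::CUDAPlace(0);
  phi::GPUContext context(gpu_place);
  // A pooled context is already initialized; a locally constructed one must be
  // given an allocator explicitly, exactly as the test does.
  context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                           .GetAllocator(gpu_place, context.stream())
                           .get());
}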
@@ -105,7 +105,7 @@ struct TestBroadcastOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       place_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
 #else
...
@@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       gc_(gc) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
-    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
+    dev_ctx_ = reinterpret_cast<phi::GPUContext *>(
         platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(place.device);
...
@@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase {
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  platform::CUDADeviceContext *dev_ctx_{nullptr};
+  phi::GPUContext *dev_ctx_{nullptr};
   gpuEvent_t event_{nullptr};
 #endif
 };
...
@@ -58,7 +58,7 @@ struct TestGatherOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       gpu_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new phi::GPUContext(p));
     }
 #else
     PADDLE_THROW(
...
@@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   if (platform::is_gpu_place(tensor->place())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    tensor_check<platform::CUDADeviceContext>(
-        op_type, var_name, *tensor, place);
+    tensor_check<phi::GPUContext>(op_type, var_name, *tensor, place);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.",
...
@@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value,
 template <>
 template <typename T>
-void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
+void TensorCheckerVisitor<phi::GPUContext>::apply(
     typename std::enable_if<
         std::is_floating_point<T>::value ||
         std::is_same<T, ::paddle::platform::complex<float>>::value ||
@@ -143,7 +143,7 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
     const {
   int print_num = 3;
-  auto* dev_ctx = reinterpret_cast<platform::CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(tensor_.place()));
   int dev_id = tensor_.place().device;
   PADDLE_ENFORCE_EQ(
@@ -226,13 +226,13 @@ void TensorCheckerVisitor<phi::GPUContext>::apply(
 }
 template <>
-void tensor_check<platform::CUDADeviceContext>(const std::string& op_type,
+void tensor_check<phi::GPUContext>(const std::string& op_type,
                                    const std::string& var_name,
                                    const framework::Tensor& tensor,
                                    const platform::Place& place) {
   std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap);
-  TensorCheckerVisitor<platform::CUDADeviceContext> vistor(
+  TensorCheckerVisitor<phi::GPUContext> vistor(
       op_type, var_name, tensor, place);
   VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor);
 }
...
@@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
       dev_ctx.second->Wait();
     }
   } else {
-    auto stream =
-        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
+    auto stream = static_cast<phi::GPUContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
@@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
         if (platform::is_gpu_place(place)) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
           auto stream =
-              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
-                  ->stream();
+              static_cast<phi::GPUContext *>(dev_ctxes_.at(place))->stream();
 #ifdef PADDLE_WITH_HIP
           PADDLE_ENFORCE_GPU_SUCCESS(
               hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
@@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
           platform::DeviceContextPool &pool =
               platform::DeviceContextPool::Instance();
           auto stream =
-              static_cast<platform::CUDADeviceContext *>(pool.Get(place))
-                  ->stream();
+              static_cast<phi::GPUContext *>(pool.Get(place))->stream();
           platform::GpuStreamSync(stream);
 #else
           PADDLE_THROW(platform::errors::PreconditionNotMet(
@@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
     if (in_var_handle) {
       if (platform::is_gpu_place(in_var_handle->place())) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-        auto stream = static_cast<platform::CUDADeviceContext *>(
+        auto stream = static_cast<phi::GPUContext *>(
                           dev_ctxes_.at(in_var_handle->place()))
                           ->stream();
 #ifdef PADDLE_WITH_HIP
@@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
       auto dev_id = p.first.device;
-      auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
-      VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
+      auto *cuda_dev_ctx = static_cast<phi::GPUContext *>(p.second);
+      VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
 #ifdef PADDLE_WITH_HIP
       PADDLE_ENFORCE_GPU_SUCCESS(
           hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
@@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
     callback();
   } else {
     auto *ctx = dev_ctxes_.at(p);
-    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
+    auto *cuda_ctx = static_cast<phi::GPUContext *>(ctx);
     cuda_ctx->RecordEvent(events_.at(p.device), callback);
   }
 #else
...
@@ -69,7 +69,7 @@ struct TestReduceOpHandle {
     for (int i = 0; i < count; ++i) {
       auto p = p::CUDAPlace(i);
       gpu_list_.push_back(p);
-      ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      ctxs_.emplace_back(new p::phi::GPUContext(p));
     }
     nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
 #else
...
@@ -77,7 +77,7 @@ struct ScaleLossGradFunctor {
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       OutT cast_coeff = static_cast<OutT>(coeff_);
-      auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
+      auto stream = static_cast<phi::GPUContext *>(ctx_)->stream();
       memory::Copy(place_,
                    out_data,
                    platform::CPUPlace(),
...
@@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                              const int hidden_size,
                              const int expand_embed_dim,
                              const int64_t total_length) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
@@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
                           const int64_t* gpu_len,
                           int slot_num,
                           int total_len) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
 #ifdef PADDLE_WITH_HIP
@@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
                              const int expand_embed_dim,
                              const int64_t total_length,
                              const int batch_size) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto slot_lengths_lod = slot_lengths;
...
@@ -223,10 +223,10 @@ class AfsManager {
     delete read_stream;
   }
   int PopenBidirectionalInternal(const char* command,
                                  FILE*& fp_read,   // NOLINT
-                                 FILE*& fp_write,
+                                 FILE*& fp_write,  // NOLINT
                                  pid_t& pid,       // NOLINT
                                  bool read,        // NOLINT
                                  bool write) {
     std::lock_guard<std::mutex> g(g_flock);
     int fd_read[2];
@@ -440,10 +440,9 @@ class BoxWrapper {
     std::vector<gpuStream_t*> stream_list;
     for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) {
       VLOG(3) << "before get context i[" << i << "]";
-      platform::CUDADeviceContext* context =
-          dynamic_cast<platform::CUDADeviceContext*>(
-              platform::DeviceContextPool::Instance().Get(
-                  platform::CUDAPlace(i)));
+      phi::GPUContext* context = dynamic_cast<phi::GPUContext*>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::CUDAPlace(i)));
       stream_list_[i] = context->stream();
       stream_list.push_back(&stream_list_[i]);
     }
...
@@ -300,7 +300,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
     const int64_t total_length,
     int* gpu_dim,
     int feature_value_size) {
-  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
@@ -333,7 +333,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
     size_t grad_value_size,
     std::vector<int>& slot_vector,
     std::vector<int>& slot_mf_dim_vector) {
-  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   auto slot_lengths_lod = slot_lengths;
...
@@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                             const int64_t* gpu_len,
                             int slot_num,
                             int total_len) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>(
...
@@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
     : GarbageCollector(place, max_memory_size) {}
 void DefaultStreamGarbageCollector::Wait() const {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->WaitStreamCallback();
+  static_cast<phi::GPUContext *>(this->dev_ctx_)->WaitStreamCallback();
 }
 void DefaultStreamGarbageCollector::ClearCallback(
     const std::function<void()> &callback) {
-  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
-      ->AddStreamCallback(callback);
+  static_cast<phi::GPUContext *>(this->dev_ctx_)->AddStreamCallback(callback);
 }
 StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
...
@@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope,
     char* temp_ptr = temp.data();
     float* temp_ptr_float = reinterpret_cast<float*>(temp_ptr);
     temp_ptr_float[0] = micro_id;
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext&>(*dev_ctx).stream();
+    auto stream = reinterpret_cast<const phi::GPUContext&>(*dev_ctx).stream();
     memory::Copy(
         place,
         tensor_data,
...
@@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request,
     }
   }
 #ifdef PADDLE_WITH_CUDA
-  auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
+  auto* dev_ctx = static_cast<phi::GPUContext*>(
       platform::DeviceContextPool::Instance().Get(place));
   PADDLE_ENFORCE_GPU_SUCCESS(
       cudaEventRecord(context->event_, dev_ctx->stream()));
...
@@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name,
   device_code.SetWorkloadPerThread(1);
   device_code.Launch(n, &args);
-  auto* dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext*>(
+  auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(
       paddle::platform::DeviceContextPool::Instance().Get(place));
   dev_ctx->Wait();
...
@@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector<T> *cpu_,
                      size_t *gpu_memory_size_) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // COPY GPU Data To CPU
-  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
   auto stream = dev_ctx->stream();
   void *src = (*gpu_)->ptr();
@@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
   *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
   (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
   void *dst = (*gpu_)->ptr();
-  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
       platform::DeviceContextPool::Instance().Get(place));
   auto stream = dev_ctx->stream();
   paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(),
...
@@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) {
 }
 gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
-  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+  return reinterpret_cast<const phi::GPUContext*>(
              paddle::platform::DeviceContextPool::Instance().Get(place))
       ->stream();
 }
...
@@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
   platform::RecordEvent record(
       "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);
-  gpuStream_t stream = reinterpret_cast<const platform::CUDADeviceContext&>(
-                           instr.DeviceContext())
-                           .stream();
+  gpuStream_t stream =
+      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
   auto TensorRecordStream = [&stream](Tensor& tensor) {
     auto allocation = tensor.Holder();
     if (allocation == nullptr) {
...
@@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
                        paddle::framework::OpKernelTest<phi::CPUContext, float>);
 REGISTER_OP_CUDA_KERNEL(
-    op_with_kernel,
-    paddle::framework::OpKernelTest<paddle::platform::CUDADeviceContext,
-                                    float>);
+    op_with_kernel, paddle::framework::OpKernelTest<phi::GPUContext, float>);
 TEST(OperatorRegistrar, CPU) {
   paddle::framework::proto::OpDesc op_desc;
@@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) {
 }
 static int op_test_value = 0;
-using paddle::platform::CUDADeviceContext;
 using paddle::platform::DeviceContext;
 using phi::CPUContext;
+using phi::GPUContext;
 namespace paddle {
 namespace framework {
@@ -301,7 +299,7 @@ class OpMultiKernelTest<CPUContext, T> : public paddle::framework::OpKernel<T> {
 };
 template <typename T>
-class OpMultiKernelTest<CUDADeviceContext, T>
+class OpMultiKernelTest<phi::GPUContext, T>
     : public paddle::framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const {
@@ -325,7 +323,7 @@ class OpMultiKernelTest2<CPUContext, T>
 };
 template <typename T>
-class OpMultiKernelTest2<CUDADeviceContext, T>
+class OpMultiKernelTest2<phi::GPUContext, T>
     : public paddle::framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const {
@@ -351,12 +349,12 @@ REGISTER_OP_KERNEL(
     op_with_multi_kernel,
     CUDA,
     paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
+    paddle::framework::OpMultiKernelTest<phi::GPUContext, float>);
 REGISTER_OP_KERNEL(
     op_with_multi_kernel,
     CUDNN,
     paddle::platform::CUDAPlace,
-    paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
+    paddle::framework::OpMultiKernelTest2<phi::GPUContext, float>);
 TEST(OperatorRegistrar, OpWithMultiKernel) {
   paddle::framework::proto::OpDesc op_desc;
...
@@ -416,13 +416,12 @@ class ExecutionContext {
   }
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const inline platform::CUDADeviceContext& cuda_device_context() const {
+  const inline phi::GPUContext& cuda_device_context() const {
     PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()),
                       true,
                       platform::errors::PreconditionNotMet(
                           "Current device context place is not GPUPlace."));
-    return *reinterpret_cast<const platform::CUDADeviceContext*>(
-        &device_context_);
+    return *reinterpret_cast<const phi::GPUContext*>(&device_context_);
   }
 #endif
...
@@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices(
       nccl_ctxs->WaitAll();
     } else {
       auto src_place = member_->places_[0];
-      auto src_dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      auto src_dev_ctx = static_cast<phi::GPUContext *>(
           platform::DeviceContextPool::Instance().Get(src_place));
       auto sizeof_dtype = framework::SizeOfType(dtype) * numel;
       for (size_t i = 1; i < member_->places_.size(); ++i) {
         auto dst_place = member_->places_[i];
-        auto dst_dev_ctx = static_cast<platform::CUDADeviceContext *>(
+        auto dst_dev_ctx = static_cast<phi::GPUContext *>(
             platform::DeviceContextPool::Instance().Get(dst_place));
         src_dev_ctx->Wait();
         dst_dev_ctx->Wait();
@@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) {
         global_scope, member_->places_);
     auto &pool = platform::DeviceContextPool::Instance();
     for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          pool.Get(member_->places_[dev_id]));
+      auto *dev_ctx =
+          static_cast<phi::GPUContext *>(pool.Get(member_->places_[dev_id]));
       auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]);
       dev_ctx->set_nccl_comm(nccl_ctx.comm());
     }
...
@@ -72,7 +72,7 @@ struct ConvertToPhiContext<phi::CPUContext> {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <>
-struct ConvertToPhiContext<platform::CUDADeviceContext> {
+struct ConvertToPhiContext<phi::GPUContext> {
   using TYPE = phi::GPUContext;
 };
 #endif
...
...@@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src,
"place is %s, context place is %s.", "place is %s, context place is %s.",
src_gpu_place, src_gpu_place,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} }
else if (platform::is_cpu_place(src_place) && // NOLINT else if (platform::is_cpu_place(src_place) && // NOLINT
...@@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src,
"destination place is %s, context place is %s.", "destination place is %s, context place is %s.",
dst_gpu_place, dst_gpu_place,
ctx_gpu_place)); ctx_gpu_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} }
else if (platform::is_gpu_place(src_place) && // NOLINT else if (platform::is_gpu_place(src_place) && // NOLINT
...@@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.", "device context GPU number is %d.",
src_gpu_place.device, src_gpu_place.device,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy( memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} }
...@@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src,
"device context GPU number is %d.", "device context GPU number is %d.",
dst_gpu_place.device, dst_gpu_place.device,
ctx_gpu_place.device)); ctx_gpu_place.device));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} }
...@@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src, ...@@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Context place error, excepted GPUPlace, but actually %s.", "Context place error, excepted GPUPlace, but actually %s.",
ctx_place)); ctx_place));
auto stream = auto stream = reinterpret_cast<const phi::GPUContext&>(ctx).stream();
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) { if (platform::is_same_place(src_place, dst_place)) {
memory::Copy( memory::Copy(
dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
...@@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os, ...@@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]); std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx = auto& gpu_dev_ctx = static_cast<const phi::GPUContext&>(dev_ctx);
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
platform::CPUPlace cpu; platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr); uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) { while (size != 0) {
...@@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { ...@@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
platform::CUDAPlace(dl_tensor.device.device_id); platform::CUDAPlace(dl_tensor.device.device_id);
dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place);
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(*ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(*ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
......
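Every hunk above reduces to the same substitution: the device context fetched from the pool is cast to phi::GPUContext (instead of platform::CUDADeviceContext) before its compute stream is handed to memory::Copy. A minimal sketch of the resulting pattern, assuming a CUDA/HIP build and the paddle::platform / paddle::memory namespaces already used by the surrounding file (CopyCpuToGpuAsync is an illustrative name, not part of the patch):

// Sketch only: async host-to-device copy on the context's compute stream.
void CopyCpuToGpuAsync(const platform::CUDAPlace& dst_place, void* dst_ptr,
                       const platform::CPUPlace& src_place, const void* src_ptr,
                       size_t size) {
  auto* dev_ctx = static_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(dst_place));
  // phi::GPUContext keeps the stream() accessor the old CUDADeviceContext had,
  // so only the cast changes at the call sites above.
  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, dev_ctx->stream());
}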
...@@ -164,13 +164,12 @@ void TensorFromArray(const T* src, ...@@ -164,13 +164,12 @@ void TensorFromArray(const T* src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -242,13 +241,12 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -242,13 +241,12 @@ void TensorFromVector(const std::vector<T>& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector<bool>& src,
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src_place,
src_place, src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
...@@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src, ...@@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src.place(),
src.place(), src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
...@@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src, ...@@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src,
} }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place,
dst_place, dst_ptr,
dst_ptr, src.place(),
src.place(), src_ptr,
src_ptr, size,
size, reinterpret_cast<const phi::GPUContext&>(ctx).stream());
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
#if defined(PADDLE_WITH_XPU) #if defined(PADDLE_WITH_XPU)
......
...@@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) { ...@@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) {
// CPU Tensor to GPU Tensor // CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0); auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
...@@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) { ...@@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) {
// Copy to GPUTensor // Copy to GPUTensor
gpu_tensor.Resize(phi::make_ddim({3, 3})); gpu_tensor.Resize(phi::make_ddim({3, 3}));
auto gpu_place = new paddle::platform::CUDAPlace(); auto gpu_place = new paddle::platform::CUDAPlace();
paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
...@@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) { ...@@ -238,7 +238,7 @@ TEST(TensorToVector, Tensor) {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor; paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place); phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream()) .GetAllocator(place, gpu_ctx.stream())
.get()); .get());
...@@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) { ...@@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) {
#endif #endif
} }
TEST(TensorToVector, Tensor_bool) { TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src;
{ bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace());
paddle::framework::Tensor src; for (int i = 0; i < 3 * 3; ++i) {
bool* src_ptr = src.mutable_data<bool>({3, 3}, paddle::platform::CPUPlace()); src_ptr[i] = static_cast<bool>(i % 2);
for (int i = 0; i < 3 * 3; ++i) { }
src_ptr[i] = static_cast<bool>(i % 2);
}
paddle::platform::CPUPlace place; paddle::platform::CPUPlace place;
std::vector<bool> dst; std::vector<bool> dst;
paddle::framework::TensorToVector<bool>(src, &dst); paddle::framework::TensorToVector<bool>(src, &dst);
for (int i = 0; i < 3 * 3; ++i) { for (int i = 0; i < 3 * 3; ++i) {
EXPECT_EQ(src_ptr[i], dst[i]); EXPECT_EQ(src_ptr[i], dst[i]);
}
} }
} // namespace framework
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
{ {
...@@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) {
}; };
paddle::framework::Tensor gpu_tensor; paddle::framework::Tensor gpu_tensor;
paddle::platform::CUDAPlace place; paddle::platform::CUDAPlace place;
paddle::platform::CUDADeviceContext gpu_ctx(place); phi::GPUContext gpu_ctx(place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, gpu_ctx.stream()) .GetAllocator(place, gpu_ctx.stream())
.get()); .get());
...@@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) { ...@@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) {
} }
} }
#endif #endif
} } // namespace paddle
TEST(TensorFromDLPack, Tensor) { TEST(TensorFromDLPack, Tensor) {
{ {
...@@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) { ...@@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) {
Tensor dst_tensor; Tensor dst_tensor;
auto gpu_place = new platform::CUDAPlace(); auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place); phi::GPUContext gpu_ctx(*gpu_place);
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(*gpu_place, gpu_ctx.stream()) .GetAllocator(*gpu_place, gpu_ctx.stream())
.get()); .get());
......
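Tests that used to construct platform::CUDADeviceContext now instantiate phi::GPUContext directly and must wire its allocator by hand before any kernel or copy runs, which is why every constructor call above is followed by SetAllocator. A condensed sketch of that setup, assuming a CUDA build (the full tests may bind further allocators, e.g. for host or pinned memory):

// Sketch only: building a usable phi::GPUContext inside a test.
paddle::platform::CUDAPlace place(0);
phi::GPUContext gpu_ctx(place);
// The unified context does not pull in an allocator by itself here, so bind
// the global facade allocator for this place and the context's own stream.
gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                         .GetAllocator(place, gpu_ctx.stream())
                         .get());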
...@@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src, ...@@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src,
auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); auto dtype = framework::TransToProtoVarType(src_tensor.dtype());
auto nccl_dtype = platform::ToNCCLDataType(dtype); auto nccl_dtype = platform::ToNCCLDataType(dtype);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
bool use_calc_stream = (dev_ctx->stream() == stream); bool use_calc_stream = (dev_ctx->stream() == stream);
...@@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src, ...@@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src,
int ring_id, int ring_id,
bool use_calc_stream) { bool use_calc_stream) {
const auto &place = GetVarPlace(src); const auto &place = GetVarPlace(src);
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
platform::NCCLComm *comm = platform::NCCLComm *comm =
platform::NCCLCommContext::Instance().Get(ring_id, place); platform::NCCLCommContext::Instance().Get(ring_id, place);
......
...@@ -122,10 +122,9 @@ class TensorAddFunctor ...@@ -122,10 +122,9 @@ class TensorAddFunctor
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) const { void operator()(const platform::CUDAPlace& place) const {
platform::CUDADeviceContext* ctx = phi::GPUContext* ctx = dynamic_cast<phi::GPUContext*>(
dynamic_cast<platform::CUDADeviceContext*>( platform::DeviceContextPool::Instance().Get(place));
platform::DeviceContextPool::Instance().Get(place)); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(*ctx);
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_); blas.AXPY(numel_, 1., x_, y_);
} }
#else #else
...@@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) { ...@@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::FP16) { if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>( return TensorAddImpl<phi::GPUContext, platform::float16>(
src_tensor, dst_tensor, place); src_tensor, dst_tensor, place);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) { ...@@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) {
if (data_type == framework::proto::VarType::BF16) { if (data_type == framework::proto::VarType::BF16) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::bfloat16>( return TensorAddImpl<phi::GPUContext, platform::bfloat16>(
src_tensor, dst_tensor, place); src_tensor, dst_tensor, place);
#else #else
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
...@@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { ...@@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float);
...@@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, ...@@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float);
...@@ -614,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1, ...@@ -614,8 +613,8 @@ std::shared_ptr<ReturnVarType> SelectedRowsMerge(const VarType& src1,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double);
} else { } else {
#endif #endif
PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float);
......
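Inside the gradient accumulator the only functional change is the context type the BLAS handle is resolved against. A small sketch of the y += x accumulation it performs on GPU, assuming a CUDA/HIP build (TensorAddSketch is an illustrative helper, not part of the patch):

// Sketch only: accumulate x into y with AXPY through the unified context.
template <typename T>
void TensorAddSketch(const platform::CUDAPlace& place, int64_t numel,
                     const T* x, T* y) {
  auto* ctx = dynamic_cast<phi::GPUContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(*ctx);
  blas.AXPY(numel, static_cast<T>(1.0), x, y);  // y = 1 * x + y on ctx's stream
}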
...@@ -85,7 +85,7 @@ void NCCLParallelContext::Init() { ...@@ -85,7 +85,7 @@ void NCCLParallelContext::Init() {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id // it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id], platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id],
strategy_.nranks_, strategy_.nranks_,
strategy_.local_rank_, strategy_.local_rank_,
...@@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { ...@@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) {
VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
<< " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
<< " ring id: " << ring_id; << " ring id: " << ring_id;
// it will assign nccl_comm in CUDADeviceContext within ring_id // it will assign nccl_comm in phi::GPUContext within ring_id
platform::NCCLCommContext::Instance().CreateComm( platform::NCCLCommContext::Instance().CreateComm(
&nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
...@@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ...@@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) {
ring_id, ring_id,
compute_events_.size())); compute_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>( auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
auto comm_stream = auto comm_stream =
...@@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ...@@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
ring_id, ring_id,
comm_events_.size())); comm_events_.size()));
auto compute_stream = static_cast<platform::CUDADeviceContext *>( auto compute_stream = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
auto comm_stream = auto comm_stream =
...@@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ...@@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) {
} }
void NCCLParallelContext::SynchronizeCompute() { void NCCLParallelContext::SynchronizeCompute() {
auto *compute_dev_ctx = static_cast<platform::CUDADeviceContext *>( auto *compute_dev_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)); platform::DeviceContextPool::Instance().Get(place_));
compute_dev_ctx->Wait(); compute_dev_ctx->Wait();
} }
......
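WaitCompute and WaitComm keep their event-based handshake; only the cast used to reach the compute stream moves to phi::GPUContext. A rough CUDA-only sketch of that handshake (compute_event and comm_stream are assumed to be the pre-created members from the surrounding file; the HIP build uses the hip* equivalents):

// Sketch only: make the NCCL comm stream wait, on the GPU, for all work
// already queued on the unified context's compute stream.
void WaitComputeSketch(const paddle::platform::Place& place,
                       cudaEvent_t compute_event, cudaStream_t comm_stream) {
  auto compute_stream = static_cast<phi::GPUContext*>(
                            paddle::platform::DeviceContextPool::Instance()
                                .Get(place))
                            ->stream();
  cudaEventRecord(compute_event, compute_stream);      // mark compute progress
  cudaStreamWaitEvent(comm_stream, compute_event, 0);  // comm waits device-side
}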
...@@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { ...@@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ConcatTensorsWithType( ConcatTensorsWithType(static_cast<const phi::GPUContext &>(context),
static_cast<const platform::CUDADeviceContext &>(context), dense_tensors_,
dense_tensors_, &dense_contents_,
&dense_contents_, dtype_);
dtype_);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't concat grad tensors since it's not compiled with NCCL," "Paddle can't concat grad tensors since it's not compiled with NCCL,"
...@@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { ...@@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
SplitTensorsWithType( SplitTensorsWithType(static_cast<const phi::GPUContext &>(context),
static_cast<const platform::CUDADeviceContext &>(context), &dense_contents_,
&dense_contents_, &dense_tensors_,
&dense_tensors_, dtype_);
dtype_);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't split grad tensor since it's not compiled with NCCL," "Paddle can't split grad tensor since it's not compiled with NCCL,"
......
...@@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor, ...@@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor,
"Unsupport BF16 in DataParallel for now")); "Unsupport BF16 in DataParallel for now"));
} }
framework::VisitDataTypeForHIP( framework::VisitDataTypeForHIP(
dtype_, dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
DivNRanksForAllReduce<platform::CUDADeviceContext>(
tensor, nranks, context));
#else #else
framework::VisitDataType(dtype_, framework::VisitDataType(
DivNRanksForAllReduce<platform::CUDADeviceContext>( dtype_, DivNRanksForAllReduce<phi::GPUContext>(tensor, nranks, context));
tensor, nranks, context));
#endif #endif
} }
#endif #endif
......
...@@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { ...@@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
void AllReduceByStream(int local_rank, int device_id) { void AllReduceByStream(int local_rank, int device_id) {
int data_size = 32; int data_size = 32;
const auto& place = platform::CUDAPlace(device_id); const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
// heter_parallel_ctx // heter_parallel_ctx
imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id);
......
...@@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) { ...@@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) {
int data_size = 4; int data_size = 4;
float test_data = 7; float test_data = 7;
const auto& place = platform::CUDAPlace(device_id); const auto& place = platform::CUDAPlace(device_id);
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); imperative::NCCLParallelContext npc(GetStrategy(local_rank), place);
......
...@@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, ...@@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
"Only one choice can be made between CPU and XPU.")); "Only one choice can be made between CPU and XPU."));
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place));
static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
auto dst_gpu_place = place; auto dst_gpu_place = place;
memory::Copy(dst_gpu_place, memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr), static_cast<void *>(input_ptr),
...@@ -283,7 +282,7 @@ bool AnalysisPredictor::Init( ...@@ -283,7 +282,7 @@ bool AnalysisPredictor::Init(
// NOTE: If the external_stream equals to global_device_contexts's stream, // NOTE: If the external_stream equals to global_device_contexts's stream,
// then fallback. // then fallback.
auto global_stream = auto global_stream =
static_cast<platform::CUDADeviceContext *>( static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get(place_)) platform::DeviceContextPool::Instance().Get(place_))
->stream(); ->stream();
if (predictor_stream_ != global_stream) { if (predictor_stream_ != global_stream) {
...@@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { ...@@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() {
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto gpu_place = place_; auto gpu_place = place_;
auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>( auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(gpu_place));
pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipStreamSynchronize(dev_ctx->stream()); hipStreamSynchronize(dev_ctx->stream());
#else #else
...@@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) { ...@@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) {
auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get()); auto *pred = dynamic_cast<paddle::AnalysisPredictor *>(p->predictor_.get());
paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool &pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
auto *dev_ctx = reinterpret_cast<paddle::platform::CUDADeviceContext *>( auto *dev_ctx = reinterpret_cast<phi::GPUContext *>(pool.Get(pred->place_));
pool.Get(pred->place_));
cudaStreamSynchronize(dev_ctx->stream()); cudaStreamSynchronize(dev_ctx->stream());
#endif #endif
} }
......
...@@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs, ...@@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto *dev_ctx = auto *dev_ctx = static_cast<const phi::GPUContext *>(pool.Get(place_));
static_cast<const platform::CUDADeviceContext *>(pool.Get(place_));
auto dst_gpu_place = place_; auto dst_gpu_place = place_;
memory::Copy(dst_gpu_place, memory::Copy(dst_gpu_place,
static_cast<void *>(input_ptr), static_cast<void *>(input_ptr),
......
...@@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst, ...@@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst,
paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool& pool =
paddle::platform::DeviceContextPool::Instance(); paddle::platform::DeviceContextPool::Instance();
paddle::platform::CUDAPlace gpu_place(dst.device_); paddle::platform::CUDAPlace gpu_place(dst.device_);
auto* dev_ctx = static_cast<const paddle::platform::CUDADeviceContext*>( auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.Get(gpu_place));
pool.Get(gpu_place));
if (src.place() == PlaceType::kCPU) { if (src.place() == PlaceType::kCPU) {
paddle::memory::Copy(gpu_place, paddle::memory::Copy(gpu_place,
......
...@@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place, ...@@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place,
} else if (platform::is_gpu_place(dst_place) && } else if (platform::is_gpu_place(dst_place) &&
platform::is_gpu_place(src_place)) { platform::is_gpu_place(src_place)) {
auto gpu_place = src_place; auto gpu_place = src_place;
memory::Copy( memory::Copy(gpu_place,
gpu_place, dst_data,
dst_data, gpu_place,
gpu_place, src_data,
src_data, size,
size, static_cast<const phi::GPUContext&>(ctx).stream());
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) { ...@@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) {
framework::Scope scope; framework::Scope scope;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx.stream()) .GetAllocator(place, ctx.stream())
.get()); .get());
......
...@@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { ...@@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync( platform::GpuStreamSync(static_cast<const phi::GPUContext&>(ctx).stream());
static_cast<const platform::CUDADeviceContext&>(ctx).stream());
} }
#endif #endif
std::vector<float> result; std::vector<float> result;
......
...@@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) { ...@@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) {
TEST(EngineIOConverterTester, DefaultGPU) { TEST(EngineIOConverterTester, DefaultGPU) {
platform::CUDAPlace place; platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place); phi::GPUContext ctx(place);
IOConverterTester(ctx); IOConverterTester(ctx);
} }
......
...@@ -124,7 +124,7 @@ class TRTConvertValidation { ...@@ -124,7 +124,7 @@ class TRTConvertValidation {
} }
void DeclVar(const std::string& name, const std::vector<int> dim_vec) { void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDADeviceContext ctx(place_); phi::GPUContext ctx(place_);
auto* x = scope_.Var(name); auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>(); auto* x_tensor = x->GetMutable<framework::LoDTensor>();
...@@ -172,7 +172,7 @@ class TRTConvertValidation { ...@@ -172,7 +172,7 @@ class TRTConvertValidation {
"But received batch_size:%d, max_batch_size_:%d", "But received batch_size:%d, max_batch_size_:%d",
batch_size, batch_size,
max_batch_size_)); max_batch_size_));
platform::CUDADeviceContext ctx(place_); phi::GPUContext ctx(place_);
op_->Run(scope_, place_); op_->Run(scope_, place_);
cudaStreamSynchronize(stream_); cudaStreamSynchronize(stream_);
std::vector<std::string> input_output_names; std::vector<std::string> input_output_names;
......
...@@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV( TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>( auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id))); platform::CUDAPlace(device_id)));
const platform::CUDADeviceContext &dev_ctx = *device_ctx; const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<float> multihead_compute_func; operators::math::MultiHeadGPUComputeFunctor<float> multihead_compute_func;
multihead_compute_func(dev_ctx, multihead_compute_func(dev_ctx,
batch, batch,
...@@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue(
TransposeQKV( TransposeQKV(
batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); batch, seq_len, head_size_, head_number_, input0_data, tptr, stream);
auto *device_ctx = static_cast<platform::CUDADeviceContext *>( auto *device_ctx = static_cast<phi::GPUContext *>(
platform::DeviceContextPool::Instance().Get( platform::DeviceContextPool::Instance().Get(
platform::CUDAPlace(device_id))); platform::CUDAPlace(device_id)));
...@@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue( ...@@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue(
apply_scale<<<blocks, threads, 0, stream>>>( apply_scale<<<blocks, threads, 0, stream>>>(
tptr, static_cast<half>(scale_), n_q); tptr, static_cast<half>(scale_), n_q);
const platform::CUDADeviceContext &dev_ctx = *device_ctx; const phi::GPUContext &dev_ctx = *device_ctx;
operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func; operators::math::MultiHeadGPUComputeFunctor<half> multihead_compute_func;
multihead_compute_func(dev_ctx, multihead_compute_func(dev_ctx,
batch, batch,
......
...@@ -34,7 +34,7 @@ namespace tensorrt { ...@@ -34,7 +34,7 @@ namespace tensorrt {
class TensorRTDynamicEngineTest : public ::testing::Test { class TensorRTDynamicEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { ...@@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test {
framework::Tensor input_; framework::Tensor input_;
framework::Tensor output_; framework::Tensor output_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTDynamicEngineTest, test_spmm) { TEST_F(TensorRTDynamicEngineTest, test_spmm) {
...@@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { ...@@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) {
class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { ...@@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test {
std::vector<framework::Tensor> inputs_; std::vector<framework::Tensor> inputs_;
std::vector<framework::Tensor> outputs_; std::vector<framework::Tensor> outputs_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) {
......
...@@ -26,7 +26,7 @@ namespace tensorrt { ...@@ -26,7 +26,7 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test { class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); ctx_ = new phi::GPUContext(platform::CUDAPlace(0));
ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .GetAllocator(platform::CUDAPlace(0), ctx_->stream())
.get()); .get());
...@@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -69,7 +69,7 @@ class TensorRTEngineTest : public ::testing::Test {
framework::Tensor input_; framework::Tensor input_;
framework::Tensor output_; framework::Tensor output_;
TensorRTEngine *engine_; TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_; phi::GPUContext *ctx_;
}; };
TEST_F(TensorRTEngineTest, add_layer) { TEST_F(TensorRTEngineTest, add_layer) {
......
...@@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) { ...@@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get()))); std::unique_ptr<Allocator>(new BestFitAllocator(cuda_allocation.get())));
platform::CUDAPlace gpu(0); platform::CUDAPlace gpu(0);
platform::CUDADeviceContext dev_ctx(gpu); phi::GPUContext dev_ctx(gpu);
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(gpu, dev_ctx.stream()) .GetAllocator(gpu, dev_ctx.stream())
.get()); .get());
...@@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) { ...@@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) {
size_t* data = reinterpret_cast<size_t*>(allocation->ptr()); size_t* data = reinterpret_cast<size_t*>(allocation->ptr());
ForEachFill fill(data); ForEachFill fill(data);
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, platform::ForRange<phi::GPUContext> for_range(dev_ctx, allocate_size);
allocate_size);
for_range(fill); for_range(fill);
memory::Copy(platform::CPUPlace(), memory::Copy(platform::CPUPlace(),
......
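The allocator test keeps its ForRange fill; the template argument is simply the unified context now. A sketch of that fill pattern for a .cu translation unit, assuming a CUDA build (FillIndex is an illustrative functor, not part of the patch):

// Sketch only: run an element-wise functor over GPU memory with ForRange.
struct FillIndex {
  explicit FillIndex(size_t* data) : data_(data) {}
  HOSTDEVICE void operator()(size_t i) const { data_[i] = i; }  // write index i
  size_t* data_;
};

void FillOnGpu(const phi::GPUContext& dev_ctx, size_t* data, size_t n) {
  paddle::platform::ForRange<phi::GPUContext> for_range(dev_ctx, n);
  for_range(FillIndex(data));  // one GPU thread per element on dev_ctx's stream
}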
...@@ -29,53 +29,51 @@ namespace memory { ...@@ -29,53 +29,51 @@ namespace memory {
namespace allocation { namespace allocation {
/** /**
* CUDADeviceContextAllocation is a wrapper of the underlying allocation. * GPUContextAllocation is a wrapper of the underlying allocation.
* CUDADeviceContextAllocation adds a CUDA stream callback for the underlying * GPUContextAllocation adds a CUDA stream callback for the underlying
* allocation so that CUDADeviceContextAllocation can be used on a CUDA stream * allocation so that GPUContextAllocation can be used on a CUDA stream
* that deletes the allocation in the callback. * that deletes the allocation in the callback.
*/ */
class CUDADeviceContextAllocation : public Allocation { class GPUContextAllocation : public Allocation {
public: public:
explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) explicit GPUContextAllocation(DecoratedAllocationPtr allocation)
: Allocation(allocation->ptr(), : Allocation(allocation->ptr(),
allocation->base_ptr(), allocation->base_ptr(),
allocation->size(), allocation->size(),
allocation->place()), allocation->place()),
underlying_allocation_(std::move(allocation)) {} underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() { ~GPUContextAllocation() {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
dev_ctx_, dev_ctx_,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Device context is not set for CUDADeviceContextAllocation")); "Device context is not set for GPUContextAllocation"));
auto *p_allocation = underlying_allocation_.release(); auto *p_allocation = underlying_allocation_.release();
VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " VLOG(4) << "Adding callback to delete GPUContextAllocation at "
<< p_allocation; << p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] { dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; VLOG(4) << "Delete GPUContextAllocation at " << p_allocation;
Allocator::AllocationDeleter(p_allocation); Allocator::AllocationDeleter(p_allocation);
}); });
} }
void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; }
dev_ctx_ = dev_ctx;
}
private: private:
DecoratedAllocationPtr underlying_allocation_; DecoratedAllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr}; const phi::GPUContext *dev_ctx_{nullptr};
}; };
/** /**
* CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation * GPUContextAllocator will allocate a GPUContextAllocation
* after waiting for a self-created event on the default stream. It does so to * after waiting for a self-created event on the default stream. It does so to
* let the non-default stream allocate GPU memory which will be * let the non-default stream allocate GPU memory which will be
* released by a stream callback. * released by a stream callback.
*/ */
class CUDADeviceContextAllocator : public Allocator { class GPUContextAllocator : public Allocator {
public: public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place, explicit GPUContextAllocator(platform::CUDAPlace place,
gpuStream_t default_stream) gpuStream_t default_stream)
: place_(place), default_stream_(default_stream) { : place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator {
#endif #endif
} }
~CUDADeviceContextAllocator() { ~GPUContextAllocator() {
if (event_) { if (event_) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
default_stream_, default_stream_,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"Default stream is not set for CUDADeviceContextAllocator")); "Default stream is not set for GPUContextAllocator"));
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
auto allocation = new CUDADeviceContextAllocation( auto allocation = new GPUContextAllocation(
static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size))); static_unique_ptr_cast<Allocation>(memory::Alloc(place_, size)));
// Wait for the event on stream // Wait for the event on stream
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator {
}; };
/** /**
* CUDADeviceContextAllocatorPool is a singleton that stores a mapping from * GPUContextAllocatorPool is a singleton that stores a mapping from
* CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a * CUDAPlace(s) to std::shared_ptr<GPUContextAllocator>. When a
* CUDADeviceContext's compute stream isn't the default stream, it can call this * phi::GPUContext's compute stream isn't the default stream, it can call this
* class to allocate GPU memory which will be released by a callback after * class to allocate GPU memory which will be released by a callback after
* stream execution. * stream execution.
*/ */
class CUDADeviceContextAllocatorPool { class GPUContextAllocatorPool {
public: public:
static CUDADeviceContextAllocatorPool &Instance() { static GPUContextAllocatorPool &Instance() {
static CUDADeviceContextAllocatorPool pool; static GPUContextAllocatorPool pool;
return pool; return pool;
} }
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) {
auto iter = auto iter =
allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId()));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
...@@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool { ...@@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool {
platform::errors::NotFound("No allocator found for CUDAPlace.")); platform::errors::NotFound("No allocator found for CUDAPlace."));
auto &allocator = iter->second; auto &allocator = iter->second;
AllocationPtr allocation = allocator->Allocate(size); AllocationPtr allocation = allocator->Allocate(size);
static_cast<CUDADeviceContextAllocation *>(allocation.get()) static_cast<GPUContextAllocation *>(allocation.get())
->SetCUDADeviceContext(&dev_ctx); ->SetGPUContext(&dev_ctx);
return allocation; return allocation;
} }
private: private:
CUDADeviceContextAllocatorPool() { GPUContextAllocatorPool() {
std::vector<int> devices = platform::GetSelectedDevices(); std::vector<int> devices = platform::GetSelectedDevices();
for (int i : devices) { for (int i : devices) {
auto place = platform::CUDAPlace(i); auto place = platform::CUDAPlace(i);
auto compute_stream = auto compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
auto allocator = std::shared_ptr<CUDADeviceContextAllocator>( auto allocator = std::shared_ptr<GPUContextAllocator>(
new CUDADeviceContextAllocator(place, compute_stream)); new GPUContextAllocator(place, compute_stream));
allocators_.insert(make_pair(place, allocator)); allocators_.insert(make_pair(place, allocator));
} }
} }
std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>> std::map<platform::CUDAPlace, std::shared_ptr<GPUContextAllocator>>
allocators_; allocators_;
}; };
......
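Behaviour of the renamed allocator is unchanged: memory obtained through the pool is released by a stream callback on the owning context, so it outlives any work already queued on that context's stream. A hedged usage sketch (dev_ctx is assumed to be a phi::GPUContext whose compute stream is not the default stream, which is the case this allocator exists for):

// Sketch only: allocate scratch memory tied to a non-default-stream context.
paddle::memory::allocation::AllocationPtr scratch =
    paddle::memory::allocation::GPUContextAllocatorPool::Instance().Alloc(
        dev_ctx, /*size=*/1 << 20);
float* ptr = reinterpret_cast<float*>(scratch->ptr());
// ... enqueue kernels that use ptr on dev_ctx.stream() ...
// Destroying the AllocationPtr registers an AddStreamCallback on dev_ctx, so
// the underlying memory is freed only after the queued work has completed.
scratch.reset();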
...@@ -37,7 +37,7 @@ const int NUM_STREAMS = 8; ...@@ -37,7 +37,7 @@ const int NUM_STREAMS = 8;
const int N = 2; const int N = 2;
const float DELTA = 1e-1; const float DELTA = 1e-1;
using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>; using CudaDevCtxVec = std::vector<std::unique_ptr<phi::GPUContext>>;
__global__ void kernel(float *x, int n) { __global__ void kernel(float *x, int n) {
int tid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x + blockIdx.x * blockDim.x;
...@@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) { ...@@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) {
void MultiStreamCompute(float **data, void MultiStreamCompute(float **data,
float **second_data, float **second_data,
const platform::CUDADeviceContext &ctx) { const phi::GPUContext &ctx) {
// multi-streams // multi-streams
AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
...@@ -88,7 +88,7 @@ void MultiStreamCompute(float **data, ...@@ -88,7 +88,7 @@ void MultiStreamCompute(float **data,
#endif #endif
} }
TEST(Malloc, CUDADeviceContextMultiStream) { TEST(Malloc, GPUContextMultiStream) {
auto place = platform::CUDAPlace(0); auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0); platform::SetDeviceId(0);
...@@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { ...@@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
main_stream_alloc_ptr.reset(); main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>( auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream()) .GetAllocator(place, ctx->stream())
.get()); .get());
...@@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { ...@@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
} }
} }
TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { TEST(Malloc, GPUContextMultiThreadMultiStream) {
auto place = platform::CUDAPlace(0); auto place = platform::CUDAPlace(0);
platform::SetDeviceId(0); platform::SetDeviceId(0);
...@@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { ...@@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
main_stream_alloc_ptr.reset(); main_stream_alloc_ptr.reset();
for (int i = 0; i < NUM_STREAMS; ++i) { for (int i = 0; i < NUM_STREAMS; ++i) {
auto ctx = std::unique_ptr<platform::CUDADeviceContext>( auto ctx = std::unique_ptr<phi::GPUContext>(new phi::GPUContext(place));
new platform::CUDADeviceContext(place));
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, ctx->stream()) .GetAllocator(place, ctx->stream())
.get()); .get());
......
...@@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { ...@@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {
allocation_implicit_stream.reset(); allocation_implicit_stream.reset();
gpuStream_t default_stream = gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>( dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place)) paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
allocation::AllocationPtr allocation_unique = allocation::AllocationPtr allocation_unique =
...@@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { ...@@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
size_t alloc_size = 256; size_t alloc_size = 256;
gpuStream_t default_stream = gpuStream_t default_stream =
dynamic_cast<platform::CUDADeviceContext *>( dynamic_cast<phi::GPUContext *>(
paddle::platform::DeviceContextPool::Instance().Get(place)) paddle::platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
std::shared_ptr<Allocation> allocation_implicit_stream = std::shared_ptr<Allocation> allocation_implicit_stream =
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
using phi::GPUContext;
using platform::ActivationDescriptor; using platform::ActivationDescriptor;
using platform::CUDADeviceContext;
using platform::TensorDescriptor; using platform::TensorDescriptor;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -39,12 +39,12 @@ template <typename T> ...@@ -39,12 +39,12 @@ template <typename T>
struct CudnnActivationFunctor { struct CudnnActivationFunctor {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
CudnnActivationFunctor(const CUDADeviceContext& ctx, CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const miopenActivationMode_t& m) const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
#else #else
CudnnActivationFunctor(const CUDADeviceContext& ctx, CudnnActivationFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const cudnnActivationMode_t& m) const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
...@@ -77,7 +77,7 @@ struct CudnnActivationFunctor { ...@@ -77,7 +77,7 @@ struct CudnnActivationFunctor {
out->mutable_data<T>(ctx_.GetPlace()))); out->mutable_data<T>(ctx_.GetPlace())));
#endif #endif
} }
const CUDADeviceContext& ctx_; const phi::GPUContext& ctx_;
const T coef_; const T coef_;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_; const miopenActivationMode_t mode_;
...@@ -90,12 +90,12 @@ template <typename T> ...@@ -90,12 +90,12 @@ template <typename T>
struct CudnnActivationGradFunctor { struct CudnnActivationGradFunctor {
using ELEMENT_TYPE = T; using ELEMENT_TYPE = T;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
CudnnActivationGradFunctor(const CUDADeviceContext& ctx, CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const miopenActivationMode_t& m) const miopenActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
#else #else
CudnnActivationGradFunctor(const CUDADeviceContext& ctx, CudnnActivationGradFunctor(const phi::GPUContext& ctx,
const T& c, const T& c,
const cudnnActivationMode_t& m) const cudnnActivationMode_t& m)
: ctx_(ctx), coef_(c), mode_(m) {} : ctx_(ctx), coef_(c), mode_(m) {}
...@@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor { ...@@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor {
dx->mutable_data<T>(ctx_.GetPlace()))); dx->mutable_data<T>(ctx_.GetPlace())));
#endif #endif
} }
const CUDADeviceContext& ctx_; const phi::GPUContext& ctx_;
const T coef_; const T coef_;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
const miopenActivationMode_t mode_; const miopenActivationMode_t mode_;
...@@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor { ...@@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor {
template <typename T> template <typename T>
struct CudnnReluFunctor : public CudnnActivationFunctor<T> { struct CudnnReluFunctor : public CudnnActivationFunctor<T> {
explicit CudnnReluFunctor(const CUDADeviceContext& ctx) explicit CudnnReluFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
}; };
template <typename T> template <typename T>
struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) explicit CudnnReluGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnRelu6Functor : public CudnnActivationFunctor<T> { struct CudnnRelu6Functor : public CudnnActivationFunctor<T> {
explicit CudnnRelu6Functor(const CUDADeviceContext& ctx) explicit CudnnRelu6Functor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} : CudnnActivationFunctor<T>(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
}; };
template <typename T> template <typename T>
struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>( : CudnnActivationGradFunctor<T>(
ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {}
...@@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> { struct CudnnSigmoidFunctor : public CudnnActivationFunctor<T> {
explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
}; };
template <typename T> template <typename T>
struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> { ...@@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
template <typename T> template <typename T>
struct CudnnTanhFunctor : public CudnnActivationFunctor<T> { struct CudnnTanhFunctor : public CudnnActivationFunctor<T> {
explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) explicit CudnnTanhFunctor(const phi::GPUContext& ctx)
: CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} : CudnnActivationFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
}; };
template <typename T> template <typename T>
struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> { struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx)
: CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} : CudnnActivationGradFunctor<T>(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {}
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
...@@ -221,7 +221,7 @@ class CudnnActivationKernel ...@@ -221,7 +221,7 @@ class CudnnActivationKernel
framework::Tensor* Out = nullptr; framework::Tensor* Out = nullptr;
ExtractActivationTensor(context, &X, &Out); ExtractActivationTensor(context, &X, &Out);
Out->mutable_data<T>(context.GetPlace()); Out->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>(); auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx); Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out);
} }
...@@ -242,7 +242,7 @@ class CudnnActivationGradKernel ...@@ -242,7 +242,7 @@ class CudnnActivationGradKernel
ExtractActivationGradTensor<Functor::FwdDeps()>( ExtractActivationGradTensor<Functor::FwdDeps()>(
context, &X, &Out, &dOut, &dX); context, &X, &Out, &dOut, &dX);
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<CUDADeviceContext>(); auto& dev_ctx = context.template device_context<phi::GPUContext>();
Functor functor(dev_ctx); Functor functor(dev_ctx);
functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"), functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"),
GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"), GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"),
......
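The two kernels above capture the core of this change: operator kernels now ask the execution context for the unified phi::GPUContext instead of the removed platform::CUDADeviceContext. A minimal sketch of that access pattern follows; ExampleGpuKernel and Functor are placeholders for illustration, not names introduced by this PR, and the include path is the usual fluid op-registry header.

    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    // Sketch only: how a CUDA kernel obtains the unified device context.
    template <typename T, typename Functor>
    class ExampleGpuKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& context) const override {
        // phi::GPUContext replaces the old platform::CUDADeviceContext alias.
        auto& dev_ctx = context.template device_context<phi::GPUContext>();
        Functor functor(dev_ctx);
        // ... extract tensors and invoke the functor, as CudnnActivationKernel does.
      }
    };

    }  // namespace operators
    }  // namespace paddle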
...@@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor = ...@@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor =
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
#define REGISTER_ACTIVATION_CUDA_KERNEL( \ #define REGISTER_ACTIVATION_CUDA_KERNEL( \
act_type, op_name, functor, grad_functor) \ act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ REGISTER_OP_CUDA_KERNEL( \
act_type, \ act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::functor<float>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::functor<double>>, \ ops::ActivationCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ ops::functor<plat::bfloat16>>); \
ops::functor<plat::float16>>, \ REGISTER_OP_CUDA_KERNEL( \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ act_type##_grad, \
ops::functor<plat::bfloat16>>); \ ops::ActivationGradCudaKernel<phi::GPUContext, \
REGISTER_OP_CUDA_KERNEL( \ ops::grad_functor<float>>, \
act_type##_grad, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<double>>, \
ops::grad_functor<float>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<plat::float16>>, \
ops::grad_functor<double>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>); ops::grad_functor<plat::bfloat16>>);
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \ #define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \
act_type, op_name, functor, grad_functor) \ act_type, op_name, functor, grad_functor) \
REGISTER_OP_CUDA_KERNEL( \ REGISTER_OP_CUDA_KERNEL( \
act_type, \ act_type, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>, \
ops::functor<float>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int>>, \
ops::functor<double>>, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int64_t>>, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
ops::functor<int>>, \ ops::ActivationCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, \ ops::functor<plat::bfloat16>>); \
ops::functor<int64_t>>, \ REGISTER_OP_CUDA_KERNEL( \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ act_type##_grad, \
ops::functor<plat::float16>>, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::ActivationCudaKernel<plat::CUDADeviceContext, \ ops::grad_functor<float>>, \
ops::functor<plat::bfloat16>>); \ ops::ActivationGradCudaKernel<phi::GPUContext, \
REGISTER_OP_CUDA_KERNEL( \ ops::grad_functor<double>>, \
act_type##_grad, \ ops::ActivationGradCudaKernel<phi::GPUContext, ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<float>>, \ ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<double>>, \ ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \ ops::ActivationGradCudaKernel<phi::GPUContext, \
ops::grad_functor<int>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<int64_t>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>, \
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::bfloat16>>); ops::grad_functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
relu6, relu6,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<float>>,
ops::CudaRelu6Functor<float>>, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<double>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int>>,
ops::CudaRelu6Functor<double>>, ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<int>>,
ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
ops::CudaRelu6Functor<int64_t>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext,
ops::CudaRelu6Functor<plat::float16>>, ops::CudaRelu6Functor<plat::float16>>,
ops::ActivationCudaKernel<plat::CUDADeviceContext, ops::ActivationCudaKernel<phi::GPUContext,
ops::CudaRelu6Functor<plat::bfloat16>>); ops::CudaRelu6Functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
relu6_grad, relu6_grad,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<float>>, ops::CudaRelu6GradFunctor<float>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<double>>, ops::CudaRelu6GradFunctor<double>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int>>, ops::CudaRelu6GradFunctor<int>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<int64_t>>, ops::CudaRelu6GradFunctor<int64_t>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::float16>>, ops::CudaRelu6GradFunctor<plat::float16>>,
ops::ActivationGradCudaKernel<plat::CUDADeviceContext, ops::ActivationGradCudaKernel<phi::GPUContext,
ops::CudaRelu6GradFunctor<plat::bfloat16>>); ops::CudaRelu6GradFunctor<plat::bfloat16>>);
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
......
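The macro rewrite above only swaps the device-context template argument; the expansion shape is otherwise unchanged. As a rough sketch, invoking the rewritten macro for a hypothetical activation foo (not an operator touched by this PR) would register kernels along these lines:

    // Hypothetical invocation; foo, FooFunctor and FooGradFunctor are placeholders.
    REGISTER_ACTIVATION_CUDA_KERNEL(foo, Foo, FooFunctor, FooGradFunctor);

    // ...which now instantiates every element type against phi::GPUContext, e.g.
    //   ops::ActivationCudaKernel<phi::GPUContext, ops::FooFunctor<float>>
    //   ops::ActivationGradCudaKernel<phi::GPUContext,
    //                                 ops::FooGradFunctor<plat::bfloat16>>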
...@@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> { ...@@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(affine_channel, REGISTER_OP_CUDA_KERNEL(affine_channel,
ops::AffineChannelCUDAKernel<CUDA, float>, ops::AffineChannelCUDAKernel<CUDA, float>,
......
...@@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Only support for CUDAPlace.Please switch your context from " "Only support for CUDAPlace.Please switch your context from "
"CPUPlace to CUDAPlace or update your cudnn.")); "CPUPlace to CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto* theta = ctx.Input<Tensor>("Theta"); auto* theta = ctx.Input<Tensor>("Theta");
auto* output = ctx.Output<Tensor>("Output"); auto* output = ctx.Output<Tensor>("Output");
...@@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
"support for CUDAPlace. Please switch " "support for CUDAPlace. Please switch "
"your context from CPUPlace to " "your context from CPUPlace to "
"CUDAPlace or update your cudnn.")); "CUDAPlace or update your cudnn."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output")); auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta")); auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
......
...@@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { ...@@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
} }
template <typename T> template <typename T>
struct Linspace<paddle::platform::CUDADeviceContext, T> { struct Linspace<phi::GPUContext, T> {
void operator()(T start, void operator()(T start,
T end, T end,
int count, int count,
...@@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
w = size_attr[3]; w = size_attr[3];
} }
T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace()); T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
phi::funcs::SetConstant<paddle::platform::CUDADeviceContext, T>()( phi::funcs::SetConstant<phi::GPUContext, T>()(
ctx.cuda_device_context(), theta_grad, static_cast<T>(0)); ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
T h_step; T h_step;
......
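Math helpers follow the same renaming: functors such as Linspace and SetConstant are specialized or instantiated on phi::GPUContext. A minimal sketch of zero-filling a gradient tensor under the unified context, mirroring the AffineGridGradOpCUDAKernel hunk above (the function name and header path are assumptions for illustration):

    #include "paddle/phi/kernels/funcs/math_function.h"

    // Sketch: fill a GPU tensor with zeros through the unified context type.
    void ZeroFillOnGpu(const phi::GPUContext& dev_ctx, phi::DenseTensor* grad) {
      phi::funcs::SetConstant<phi::GPUContext, float>()(
          dev_ctx, grad, static_cast<float>(0));
    }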
...@@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X"); const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale"); const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out"); auto outs = ctx.MultiOutput<framework::Tensor>("Out");
...@@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> { ...@@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace()); bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
framework::Tensor inverse_scale = framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<MPDType, platform::CUDADeviceContext>({1}, ctx.AllocateTmpTensor<MPDType, phi::GPUContext>({1}, dev_ctx);
dev_ctx);
MPDType* inverse_scale_v = inverse_scale.template data<MPDType>(); MPDType* inverse_scale_v = inverse_scale.template data<MPDType>();
InverseAndMemset<MPDType><<<1, 1, 0, dev_ctx.stream()>>>( InverseAndMemset<MPDType><<<1, 1, 0, dev_ctx.stream()>>>(
......
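Temporary workspace allocation keys on the same context type: AllocateTmpTensor is now templated on phi::GPUContext. A short sketch of the pattern from the check_finite_and_unscale hunk above, restated as a helper (the helper name is illustrative; MPDType stands for the kernel's multi-precision type, as in the original):

    // Sketch: allocate a one-element temporary on the GPU via the execution
    // context, bound to the unified phi::GPUContext.
    template <typename MPDType>
    framework::Tensor AllocInverseScale(const framework::ExecutionContext& ctx,
                                        const phi::GPUContext& dev_ctx) {
      framework::Tensor inverse_scale =
          ctx.AllocateTmpTensor<MPDType, phi::GPUContext>({1}, dev_ctx);
      // Callers read/write it through inverse_scale.template data<MPDType>().
      return inverse_scale;
    }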
...@@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs, ...@@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs,
} }
template <typename T, bool IsFoundInfOnCPU> template <typename T, bool IsFoundInfOnCPU>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, class UpdateLossScalingFunctor<phi::GPUContext, T, IsFoundInfOnCPU> {
T,
IsFoundInfOnCPU> {
public: public:
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const T* pre_loss_scaling_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* good_in_data,
...@@ -134,9 +132,9 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext, ...@@ -134,9 +132,9 @@ class UpdateLossScalingFunctor<platform::CUDADeviceContext,
}; };
template <typename T> template <typename T>
class LazyZeros<platform::CUDADeviceContext, T> { class LazyZeros<phi::GPUContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& dev_ctx, void operator()(const phi::GPUContext& dev_ctx,
const bool* found_inf_data, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs, const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const { const std::vector<framework::Tensor*>& outs) const {
...@@ -204,7 +202,7 @@ class LazyZeros<platform::CUDADeviceContext, T> { ...@@ -204,7 +202,7 @@ class LazyZeros<platform::CUDADeviceContext, T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
using GPU = paddle::platform::CUDADeviceContext; using GPU = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling, REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>, ops::UpdateLossScalingKernel<GPU, float>,
......
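Device-specialized functor templates get the same treatment: their GPU specializations are now written against phi::GPUContext, and registration files keep a short alias for it. A rough sketch of the shape (FooFunctor is a placeholder, not a class from this PR):

    // Primary template, specialized per device context elsewhere.
    template <typename DeviceContext, typename T>
    class FooFunctor;

    // The GPU specialization now names phi::GPUContext directly.
    template <typename T>
    class FooFunctor<phi::GPUContext, T> {
     public:
      void operator()(const phi::GPUContext& dev_ctx, const T* in, T* out) const {
        // ... launch device code on dev_ctx.stream(), as the loss-scaling
        // functors above do.
      }
    };

    // Registration sites keep a terse alias, as in `using GPU = phi::GPUContext;`.
    using GPU = phi::GPUContext;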
...@@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function<platform::Place, void> { ...@@ -54,7 +54,7 @@ struct ArrayToLoDFunctor : public std::unary_function<platform::Place, void> {
Apply(static_cast<phi::CPUContext *>(pool.Get(place))); Apply(static_cast<phi::CPUContext *>(pool.Get(place)));
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
Apply(static_cast<platform::CUDADeviceContext *>(pool.Get(place))); Apply(static_cast<phi::GPUContext *>(pool.Get(place)));
#else #else
PADDLE_THROW( PADDLE_THROW(
platform::errors::Unavailable("Paddle is not compiled with CUDA.")); platform::errors::Unavailable("Paddle is not compiled with CUDA."));
......
...@@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel<T> { ...@@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel<T> {
*eff_num_len, platform::CPUPlace(), &cpu_eff_num_len); *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len);
cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0]; cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
} }
const auto& dev_ctx = const auto& dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data});
auto out_data = out->mutable_data<T>(out_dims, place); auto out_data = out->mutable_data<T>(out_dims, place);
......
...@@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> { ...@@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
T* out_data = output->mutable_data<T>(ctx.GetPlace()); T* out_data = output->mutable_data<T>(ctx.GetPlace());
// initialize // initialize
auto out_eigen = framework::EigenVector<T>::Flatten(*output); auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>() auto& place =
.eigen_device(); *ctx.template device_context<phi::GPUContext>().eigen_device();
out_eigen.device(place) = out_eigen.constant(static_cast<T>(0)); out_eigen.device(place) = out_eigen.constant(static_cast<T>(0));
CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transA = CblasNoTrans;
...@@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> { ...@@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = ins_num * in_dim; int64_t strideA = ins_num * in_dim;
int64_t strideB = in_dim * out_dim; int64_t strideB = in_dim * out_dim;
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
blas.BatchedGEMM(transA, blas.BatchedGEMM(transA,
transB, transB,
ins_num, ins_num,
...@@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
auto in_dim = input_dims[2]; auto in_dim = input_dims[2];
auto out_dim = w_dims[2]; auto out_dim = w_dims[2];
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto& place = *ctx.template device_context<platform::CUDADeviceContext>() auto& place =
.eigen_device(); *ctx.template device_context<phi::GPUContext>().eigen_device();
// initialize // initialize
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto dx_eigen = framework::EigenVector<T>::Flatten(*dx); auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
...@@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
out_dim, out_dim,
db_data); db_data);
auto blas = phi::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
T alpha = 1; T alpha = 1;
T beta = 0; T beta = 0;
...@@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext; using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(batch_fc, REGISTER_OP_CUDA_KERNEL(batch_fc,
ops::BatchFCCUDAKernel<GPUCtx, float>, ops::BatchFCCUDAKernel<GPUCtx, float>,
ops::BatchFCCUDAKernel<GPUCtx, double>); ops::BatchFCCUDAKernel<GPUCtx, double>);
......
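BLAS handles are fetched through the unified context as well. A minimal sketch of obtaining a BLAS object bound to phi::GPUContext and issuing a single GEMM, following the batch_fc hunks above (the function name, shapes, and header path are assumptions for illustration):

    #include "paddle/phi/kernels/funcs/blas/blas.h"

    // Sketch: C = A * B on the GPU through the unified context.
    void GemmSketch(const phi::GPUContext& dev_ctx,
                    const float* a, const float* b, float* c,
                    int m, int n, int k) {
      auto blas = phi::funcs::GetBlas<phi::GPUContext, float>(dev_ctx);
      blas.GEMM(CblasNoTrans, CblasNoTrans, m, n, k,
                /*alpha=*/1.0f, a, b, /*beta=*/0.0f, c);
    }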
...@@ -17,9 +17,8 @@ limitations under the License. */ ...@@ -17,9 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(beam_search,
beam_search, ops::BeamSearchOpKernel<phi::GPUContext, float>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, float>, ops::BeamSearchOpKernel<phi::GPUContext, double>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, double>, ops::BeamSearchOpKernel<phi::GPUContext, int>,
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int>, ops::BeamSearchOpKernel<phi::GPUContext, int64_t>);
ops::BeamSearchOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
using CUDA = paddle::platform::CUDADeviceContext; using CUDA = phi::GPUContext;
// See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc
REGISTER_OP_CUDA_KERNEL(transfer_dtype, REGISTER_OP_CUDA_KERNEL(transfer_dtype,
ops::CastOpKernel<CUDA, float>, ops::CastOpKernel<CUDA, float>,
......
...@@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel<T> { ...@@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
using GPUCtx = paddle::platform::CUDADeviceContext; using GPUCtx = phi::GPUContext;
REGISTER_OP_CUDA_KERNEL(center_loss, REGISTER_OP_CUDA_KERNEL(center_loss,
ops::CenterLossCUDAKernel<GPUCtx, float>, ops::CenterLossCUDAKernel<GPUCtx, float>,
ops::CenterLossCUDAKernel<GPUCtx, double>); ops::CenterLossCUDAKernel<GPUCtx, double>);
......
...@@ -17,8 +17,7 @@ limitations under the License. */ ...@@ -17,8 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
using CUDADeviceContext = paddle::platform::CUDADeviceContext;
/* see [Why use single type kernel] */ /* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
cinn_instruction_run, cinn_instruction_run,
ops::CinnInstructionRunOpKernel<CUDADeviceContext, float>); ops::CinnInstructionRunOpKernel<phi::GPUContext, float>);
...@@ -18,6 +18,4 @@ limitations under the License. */ ...@@ -18,6 +18,4 @@ limitations under the License. */
/* see [Why use single type kernel] */ /* see [Why use single type kernel] */
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
cinn_launch, cinn_launch, paddle::operators::CinnLaunchOpKernel<phi::GPUContext, float>);
paddle::operators::CinnLaunchOpKernel<paddle::platform::CUDADeviceContext,
float>);
...@@ -21,10 +21,8 @@ namespace paddle::operators::details { ...@@ -21,10 +21,8 @@ namespace paddle::operators::details {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <> template <>
void* GetStream<platform::CUDADeviceContext>( void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx) {
const framework::ExecutionContext& ctx) { const auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
return dev_ctx.stream(); return dev_ctx.stream();
} }
#endif #endif
......
...@@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) { ...@@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
template <> template <>
void* GetStream<platform::CUDADeviceContext>( void* GetStream<phi::GPUContext>(const framework::ExecutionContext& ctx);
const framework::ExecutionContext& ctx);
#endif #endif
} // namespace details } // namespace details
......
...@@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> { ...@@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace());
// use global calculate stream // use global calculate stream
const auto calcu_stream = const auto calcu_stream =
static_cast<platform::CUDADeviceContext*>( static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) platform::DeviceContextPool::Instance().Get(ctx.GetPlace()))
->stream(); ->stream();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
...@@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> { ...@@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
class_center_sample, class_center_sample,
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext, ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int64_t>,
int64_t>, ops::ClassCenterSampleCUDAKernel<phi::GPUContext, int>);
ops::ClassCenterSampleCUDAKernel<paddle::platform::CUDADeviceContext, int>);
...@@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor, ...@@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
coalesce_tensor, coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, ops::CoalesceTensorOpKernel<phi::GPUContext, plat::float16>,
plat::float16>, ops::CoalesceTensorOpKernel<phi::GPUContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, int>, ops::CoalesceTensorOpKernel<phi::GPUContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, float>, ops::CoalesceTensorOpKernel<phi::GPUContext, double>);
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
......
...@@ -17,10 +17,9 @@ limitations under the License. */ ...@@ -17,10 +17,9 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(allreduce,
allreduce, ops::AllReduceOpKernel<phi::GPUContext, float>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, float>, ops::AllReduceOpKernel<phi::GPUContext, double>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, double>, ops::AllReduceOpKernel<phi::GPUContext, int>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int>, ops::AllReduceOpKernel<phi::GPUContext, int64_t>,
ops::AllReduceOpKernel<plat::CUDADeviceContext, int64_t>, ops::AllReduceOpKernel<phi::GPUContext, plat::float16>);
ops::AllReduceOpKernel<plat::CUDADeviceContext, plat::float16>);
...@@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> { ...@@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"AllReduce op can run on gpu place only for now.")); "AllReduce op can run on gpu place only for now."));
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto in = ctx.Input<framework::Tensor>("X"); auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out"); auto out = ctx.Output<framework::Tensor>("Out");
......
...@@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> { ...@@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> { ...@@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
int rid = ctx.Attr<int>("ring_id"); int rid = ctx.Attr<int>("ring_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
auto stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); auto stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
ncclRedOp_t nccl_red_type = ncclSum; ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
......
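The collective kernels repeat one lookup over and over: when use_calc_stream is set, the default stream comes from the pooled device context, which is now downcast to phi::GPUContext. A sketch of that lookup in isolation (the helper name is illustrative; place is assumed to be a CUDA place):

    // Sketch: fetch the calculation stream for a GPU place from the pool.
    gpuStream_t GetCalcStream(const platform::Place& place) {
      auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      return static_cast<phi::GPUContext*>(dev_ctx)->stream();
    }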
...@@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> { ...@@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel<T> {
platform::errors::PreconditionNotMet("Currently, the broadcast op can " platform::errors::PreconditionNotMet("Currently, the broadcast op can "
"only be an In-Place operation.")); "only be an In-Place operation."));
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto comm = dev_ctx.nccl_comm(); auto comm = dev_ctx.nccl_comm();
auto stream = dev_ctx.stream(); auto stream = dev_ctx.stream();
......
...@@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> { ...@@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> { ...@@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> { ...@@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> { ...@@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
T* recv_buff = temp_out.data<T>(); T* recv_buff = temp_out.data<T>();
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::ncclAllGather(send_buff, platform::dynload::ncclAllGather(send_buff,
...@@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> { ...@@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
offset += rows_per_tensor; offset += rows_per_tensor;
} }
math::ConcatFunctor<platform::CUDADeviceContext, T> functor; math::ConcatFunctor<phi::GPUContext, T> functor;
out->mutable_data<T>(out_dims, place); out->mutable_data<T>(out_dims, place);
auto& dev_ctx2 = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx2 = ctx.template device_context<phi::GPUContext>();
functor(dev_ctx2, inputs, axis, out); functor(dev_ctx2, inputs, axis, out);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> { ...@@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
auto *ids_t = context.Input<LoDTensor>("Ids"); auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out"); auto *output_t = context.Output<LoDTensor>("Out");
const auto &dev_ctx = const auto &dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index"); const int64_t start_idx = context.Attr<int64_t>("start_index");
size_t N = table_t->dims()[0]; size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1]; size_t D = table_t->dims()[1];
...@@ -142,8 +141,7 @@ template <typename T> ...@@ -142,8 +141,7 @@ template <typename T>
class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> { class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const auto &dev_ctx = const auto &dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index"); const int64_t start_idx = context.Attr<int64_t>("start_index");
auto ids_t = context.Input<LoDTensor>("Ids"); auto ids_t = context.Input<LoDTensor>("Ids");
auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out")); auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
......
...@@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> { ...@@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> { ...@@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> { ...@@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel<T> {
gpuStream_t stream = nullptr; gpuStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream(); stream = static_cast<phi::GPUContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
......
...@@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
const auto& place = ctx.GetPlace(); const auto& place = ctx.GetPlace();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
// use global calculate stream // use global calculate stream
const auto stream = static_cast<platform::CUDADeviceContext*>( const auto stream = static_cast<phi::GPUContext*>(
platform::DeviceContextPool::Instance().Get(place)) platform::DeviceContextPool::Instance().Get(place))
->stream(); ->stream();
...@@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max // step 1, obtain logit_max
Tensor logits_max; Tensor logits_max;
logits_max = logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place); void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max); auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
...@@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 3, obtain predict target // step 3, obtain predict target
Tensor predicted_logits; Tensor predicted_logits;
predicted_logits = predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx); ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place); predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits); auto t = framework::EigenVector<T>::Flatten(predicted_logits);
...@@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> { ...@@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits // step 5, obtain sum_exp_logits
Tensor sum_exp_logits; Tensor sum_exp_logits;
sum_exp_logits = sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place); void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits); auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
...@@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
const int rank = ctx.Attr<int>("rank"); const int rank = ctx.Attr<int>("rank");
const auto& place = ctx.GetPlace(); const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
auto map = distributed::ProcessGroupMapFromGid::getInstance(); auto map = distributed::ProcessGroupMapFromGid::getInstance();
distributed::ProcessGroup* pg = map->get(rid); distributed::ProcessGroup* pg = map->get(rid);
...@@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 1, obtain logit_max // step 1, obtain logit_max
Tensor logits_max; Tensor logits_max;
logits_max = logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max); auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1); Eigen::DSizes<int, 1> along_axis(1);
...@@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 3, obtain predict target // step 3, obtain predict target
Tensor predicted_logits; Tensor predicted_logits;
predicted_logits = predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx); ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place); predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits); auto t = framework::EigenVector<T>::Flatten(predicted_logits);
...@@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> { ...@@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
// step 5, obtain sum_exp_logits // step 5, obtain sum_exp_logits
Tensor sum_exp_logits; Tensor sum_exp_logits;
sum_exp_logits = sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place); void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits); auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
...@@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> { ...@@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
context.Output<Tensor>(framework::GradVarName("Logits")); context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* softmax = context.Input<Tensor>("Softmax"); const Tensor* softmax = context.Input<Tensor>("Softmax");
const int rank = context.Attr<int>("rank"); const int rank = context.Attr<int>("rank");
auto& dev_ctx = auto& dev_ctx = context.template device_context<phi::GPUContext>();
context.template device_context<platform::CUDADeviceContext>();
if (logit_grad != softmax) { if (logit_grad != softmax) {
framework::TensorCopy( framework::TensorCopy(
......
(Diffs for the remaining changed files are collapsed in this view.)