From 8676302364924bb190dcf171da7cf30d290aa2a6 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Mon, 1 Aug 2022 19:40:30 +0800
Subject: [PATCH] unify gpu context (#44740)

* remove cudaDeviceContext
* remove more template
* fix rocm compile
* remove alias name CUDADeviceContext
* fix compile
* fix tests
* revert changes
---
 .../fluid/distributed/collective/HCCLTools.h | 2 +-
 .../fluid/distributed/collective/NCCLTools.h | 8 +-
 .../collective/ProcessGroupNCCL.cc | 12 +-
 .../distributed/collective/ProcessGroupNCCL.h | 4 +-
 .../fluid/distributed/collective/reducer.cc | 6 +-
 .../distributed/fleet_executor/dist_model.cc | 3 +-
 .../distributed/ps/service/brpc_utils.cc | 12 +-
 .../distributed/ps/service/heter_client.cc | 3 +-
 .../eager_generated/backwards/scale_node.cc | 17 +-
 paddle/fluid/eager/nan_inf_utils.cc | 3 +-
 .../performance_tests/benchmark_fluid_cuda.cc | 9 +-
 .../performance_tests/benchmark_utils.cc | 6 +-
 paddle/fluid/eager/tests/test_utils.h | 8 +-
 .../framework/data_device_transform_test.cu | 5 +-
 paddle/fluid/framework/data_feed.cc | 4 +-
 paddle/fluid/framework/data_feed.cu | 4 +-
 paddle/fluid/framework/data_type_transform.cc | 4 +-
 .../framework/data_type_transform_test.cu | 2 +-
 .../details/broadcast_op_handle_test.h | 2 +-
 .../details/eager_deletion_op_handle.cc | 2 +-
 .../details/eager_deletion_op_handle.h | 2 +-
 .../details/gather_op_handle_test.cc | 2 +-
 .../framework/details/nan_inf_utils_detail.cc | 3 +-
 .../framework/details/nan_inf_utils_detail.cu | 14 +-
 .../fluid/framework/details/op_handle_base.cc | 17 +-
 .../details/reduce_op_handle_test.cc | 2 +-
 .../details/scale_loss_grad_op_handle.cc | 2 +-
 paddle/fluid/framework/fleet/box_wrapper.cu | 6 +-
 paddle/fluid/framework/fleet/box_wrapper.h | 15 +-
 .../framework/fleet/heter_ps/feature_value.cu | 4 +-
 .../fluid/framework/fleet/ps_gpu_wrapper.cu | 2 +-
 paddle/fluid/framework/garbage_collector.cc | 6 +-
 .../fluid/framework/heter_section_worker.cc | 3 +-
 paddle/fluid/framework/heterxpu_trainer.cc | 2 +-
 .../ir/fusion_group/code_generator_tester.cc | 2 +-
 paddle/fluid/framework/mixed_vector.cc | 4 +-
 paddle/fluid/framework/mixed_vector_test.cu | 2 +-
 .../framework/new_executor/interpretercore.cc | 5 +-
 paddle/fluid/framework/op_registry_test.cc | 14 +-
 paddle/fluid/framework/operator.h | 5 +-
 paddle/fluid/framework/parallel_executor.cc | 8 +-
 paddle/fluid/framework/phi_utils.h | 2 +-
 paddle/fluid/framework/tensor_util.cc | 31 ++--
 paddle/fluid/framework/tensor_util.h | 65 ++++----
 paddle/fluid/framework/tensor_util_test.cc | 36 ++--
 paddle/fluid/imperative/all_reduce.cc | 4 +-
 .../fluid/imperative/gradient_accumulator.cc | 23 ++-
 paddle/fluid/imperative/nccl_context.cc | 10 +-
 paddle/fluid/imperative/reducer.cc | 18 +-
 paddle/fluid/imperative/reducer.cu | 9 +-
 .../tests/heter_ccl_context_test.cc | 2 +-
 .../imperative/tests/nccl_context_test.cc | 2 +-
 .../fluid/inference/api/analysis_predictor.cc | 11 +-
 paddle/fluid/inference/api/api_impl.cc | 3 +-
 .../inference/api/paddle_infer_contrib.cc | 3 +-
 paddle/fluid/inference/lite/tensor_utils.cc | 13 +-
 .../fluid/inference/lite/test_engine_lite.cc | 2 +-
 .../fluid/inference/lite/test_tensor_utils.cc | 3 +-
 .../tensorrt/convert/test_io_converter.cc | 2 +-
 .../inference/tensorrt/convert/ut_helper.h | 4 +-
 .../tensorrt/plugin/qkv_to_context_plugin.cu | 8 +-
 .../inference/tensorrt/test_dynamic_engine.cc | 8 +-
 .../fluid/inference/tensorrt/test_engine.cc | 4 +-
 .../allocation/best_fit_allocator_test.cu | 5 +-
 .../cuda_device_context_allocator.h | 64 ++++----
 paddle/fluid/memory/malloc_test.cu
| 14 +- .../memory/stream_safe_cuda_alloc_test.cu | 4 +- .../fluid/operators/activation_cudnn_op.cu.cc | 34 ++-- paddle/fluid/operators/activation_op.kps | 119 ++++++-------- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../operators/affine_grid_cudnn_op.cu.cc | 4 +- paddle/fluid/operators/affine_grid_op.cu | 4 +- .../amp/check_finite_and_unscale_op.cu | 5 +- .../operators/amp/update_loss_scaling_op.cu | 12 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/assign_pos_op.cu | 3 +- paddle/fluid/operators/batch_fc_op.cu | 18 +- paddle/fluid/operators/beam_search_op.cu.cc | 11 +- paddle/fluid/operators/cast_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 2 +- .../cinn/cinn_instruction_run_op.cu.cc | 3 +- .../fluid/operators/cinn/cinn_launch_op.cu.cc | 4 +- paddle/fluid/operators/cinn/cinn_op_helper.cc | 6 +- paddle/fluid/operators/cinn/cinn_op_helper.h | 3 +- .../fluid/operators/class_center_sample_op.cu | 7 +- paddle/fluid/operators/coalesce_tensor_op.cc | 9 +- .../operators/collective/allreduce_op.cu.cc | 13 +- .../fluid/operators/collective/allreduce_op.h | 2 +- .../operators/collective/alltoall_op.cu.cc | 2 +- .../operators/collective/barrier_op.cu.cc | 2 +- .../operators/collective/broadcast_op.cu.cc | 2 +- .../operators/collective/c_allgather_op.cu.cc | 2 +- .../operators/collective/c_allreduce_op.h | 2 +- .../operators/collective/c_broadcast_op.cu.cc | 2 +- .../operators/collective/c_concat_op.cu.cc | 6 +- .../operators/collective/c_embedding_op.cu | 6 +- .../fluid/operators/collective/c_reduce_op.h | 2 +- .../collective/c_reducescatter_op.cu.cc | 2 +- .../operators/collective/c_scatter_op.cu.cc | 2 +- .../c_softmax_with_cross_entropy_op.cu | 25 ++- .../fluid/operators/collective/c_split_op.cu | 2 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../operators/collective/c_wait_comm_op.cc | 2 +- .../operators/collective/c_wait_compute_op.cc | 2 +- .../collective/global_gather_op.cu.cc | 2 +- .../collective/global_scatter_op.cu.cc | 2 +- .../collective/partial_allgather_op.cu.cc | 2 +- .../collective/partial_recv_op.cu.cc | 2 +- .../collective/partial_send_op.cu.cc | 2 +- .../operators/collective/recv_v2_op.cu.cc | 2 +- .../operators/collective/send_v2_op.cu.cc | 2 +- paddle/fluid/operators/conv_shift_op.cu | 24 +-- paddle/fluid/operators/conv_transpose_op.cc | 6 +- .../fluid/operators/copy_cross_scope_test.cc | 4 +- paddle/fluid/operators/correlation_op.cu | 14 +- paddle/fluid/operators/cos_sim_op.cu | 8 +- paddle/fluid/operators/crop_op.cc | 14 +- paddle/fluid/operators/cross_entropy_op.cu | 2 +- paddle/fluid/operators/ctc_align_op.cu | 7 +- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 18 +- paddle/fluid/operators/cvm_op.cu | 6 +- paddle/fluid/operators/data_norm_op.cu | 26 ++- .../operators/deformable_psroi_pooling_op.cu | 2 +- .../fluid/operators/dequantize_abs_max_op.cu | 10 +- paddle/fluid/operators/dequantize_log_op.cu | 8 +- .../fluid/operators/detail/strided_memcpy.h | 6 +- .../detection/anchor_generator_op.cu | 3 +- .../fluid/operators/detection/bbox_util.cu.h | 6 +- .../fluid/operators/detection/box_clip_op.cu | 7 +- .../detection/box_decoder_and_assign_op.cu | 6 +- .../detection/collect_fpn_proposals_op.cu | 14 +- .../detection/density_prior_box_op.cu | 3 +- .../detection/distribute_fpn_proposals_op.cu | 10 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cu | 9 +- .../operators/detection/iou_similarity_op.cu | 7 +- .../fluid/operators/detection/prior_box_op.cu | 3 +- 
.../detection/roi_perspective_transform_op.cu | 4 +- .../detection/sigmoid_focal_loss_op.cu | 11 +- .../operators/detection/target_assign_op.cu | 17 +- paddle/fluid/operators/dgc_clip_by_norm_op.cu | 5 +- paddle/fluid/operators/dgc_op.cu | 3 +- paddle/fluid/operators/diag_op.cu | 11 +- .../operators/dlnne/dlnne_engine_op_test.cc | 6 +- paddle/fluid/operators/dropout_op_test.cc | 2 +- paddle/fluid/operators/edit_distance_op.cu | 9 +- paddle/fluid/operators/eigvalsh_op.cu | 28 ++-- .../elementwise/elementwise_op_function.h | 4 +- paddle/fluid/operators/expand_as_op.cc | 24 ++- paddle/fluid/operators/expand_op.cc | 24 ++- paddle/fluid/operators/fake_dequantize_op.cu | 2 +- .../fluid/operators/fake_dequantize_op.cu.h | 16 +- paddle/fluid/operators/fake_quantize_op.cu | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 51 +++--- paddle/fluid/operators/fc_op.cu.cc | 9 +- paddle/fluid/operators/feed_forward_test.cu | 12 +- paddle/fluid/operators/fill_any_op.cu.cc | 26 ++- paddle/fluid/operators/fill_constant_op.h | 4 +- paddle/fluid/operators/fill_diagonal_op.cu | 1 - .../operators/fill_diagonal_tensor_op.cu | 6 +- .../fluid/operators/fill_zeros_like_op.cu.cc | 36 ++-- paddle/fluid/operators/flatten_op.cu.cc | 60 ++++--- paddle/fluid/operators/fold_op.cu | 14 +- paddle/fluid/operators/fsp_op.cu | 8 +- .../operators/fused/attention_layer_norm.h | 4 +- .../fluid/operators/fused/attn_bias_add.cu.h | 11 +- .../fluid/operators/fused/attn_feed_forward.h | 8 +- paddle/fluid/operators/fused/attn_gemm.h | 8 +- .../fluid/operators/fused/conv_fusion_op.cu | 12 +- .../operators/fused/cudnn_bn_add_relu_test.cc | 30 ++-- .../fused/cudnn_bn_stats_finalize.cu.h | 8 +- .../operators/fused/cudnn_norm_conv.cu.h | 22 +-- .../operators/fused/cudnn_norm_conv_test.cc | 32 ++-- .../fused/cudnn_scale_bias_add_relu.cu.h | 10 +- paddle/fluid/operators/fused/fmha_ref.h | 8 +- .../operators/fused/fused_attention_op.cu | 2 +- .../operators/fused/fused_bn_activation_op.cu | 24 ++- .../fused/fused_bn_add_activation_op.cu | 13 +- .../operators/fused/fused_dropout_act_bias.h | 4 +- .../fused/fused_dropout_act_bias_test.cu | 6 +- .../operators/fused/fused_dropout_common.h | 6 +- .../operators/fused/fused_dropout_helper.h | 25 ++- .../operators/fused/fused_dropout_test.h | 6 +- .../fused/fused_elemwise_activation_op.cu | 32 ++-- .../fused_embedding_eltwise_layernorm_op.cu | 3 +- .../fused_fc_elementwise_layernorm_op.cu | 4 +- .../operators/fused/fused_feedforward_op.cu | 24 ++- .../operators/fused/fused_gate_attention.h | 14 +- .../fused/fused_gate_attention_op.cu | 4 +- .../operators/fused/fused_gemm_epilogue_op.cu | 19 +-- .../fused_layernorm_residual_dropout_bias.h | 4 +- ...ed_layernorm_residual_dropout_bias_test.cu | 6 +- .../fused/fused_multi_transformer_op.cu | 6 +- .../fused/fused_residual_dropout_bias.h | 4 +- .../fused/fused_residual_dropout_bias_test.cu | 6 +- .../operators/fused/fused_seqpool_cvm_op.cu | 10 +- .../fused/fusion_conv_inception_op.cu | 2 +- .../operators/fused/fusion_group_op.cu.cc | 9 +- .../fusion_transpose_flatten_concat_op.cu.cc | 2 +- .../operators/fused/multihead_matmul_op.cu | 7 +- .../fluid/operators/fused/resnet_unit_op.cu | 4 +- .../operators/fused/skip_layernorm_op.cu | 5 +- .../fluid/operators/fused/yolo_box_head_op.cu | 3 +- .../fluid/operators/fused_softmax_mask_op.cu | 8 +- .../fused_softmax_mask_upper_triangle_op.cu | 11 +- .../fluid/operators/gather_scatter_kernel.cu | 6 +- paddle/fluid/operators/gaussian_random_op.cu | 3 +- .../fluid/operators/graph_khop_sampler_op.cu | 23 +-- 
.../operators/grid_sampler_cudnn_op.cu.cc | 4 +- paddle/fluid/operators/group_norm_op.cu | 28 ++-- paddle/fluid/operators/gru_op.cu.cc | 14 +- paddle/fluid/operators/gru_unit_op.cu | 14 +- paddle/fluid/operators/hinge_loss_op.cc | 10 +- paddle/fluid/operators/im2sequence_op.cc | 10 +- paddle/fluid/operators/inplace_abn_op.cu | 18 +- paddle/fluid/operators/interpolate_op.cu | 12 +- paddle/fluid/operators/isfinite_op.cu | 57 ++----- paddle/fluid/operators/l1_norm_op.cc | 8 +- .../fluid/operators/limit_by_capacity_op.cu | 3 +- paddle/fluid/operators/lite/lite_engine_op.h | 4 +- .../operators/lite/lite_engine_op_test.cc | 2 +- paddle/fluid/operators/lite/ut_helper.h | 2 +- paddle/fluid/operators/load_combine_op.cu | 13 +- paddle/fluid/operators/load_op.cu | 13 +- paddle/fluid/operators/lod_reset_op.cu | 22 ++- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_op.cu | 3 +- paddle/fluid/operators/lookup_table_v2_op.cu | 3 +- paddle/fluid/operators/lrn_op.cu | 22 ++- paddle/fluid/operators/lstm_op.cu.cc | 14 +- paddle/fluid/operators/lstmp_op.cu | 14 +- paddle/fluid/operators/lstsq_op.cu | 67 ++++---- .../operators/margin_cross_entropy_op.cu | 17 +- paddle/fluid/operators/margin_rank_loss_op.cu | 10 +- paddle/fluid/operators/marker_op.cu | 2 +- paddle/fluid/operators/math/beam_search.cu | 12 +- .../fluid/operators/math/beam_search_test.cc | 12 +- .../operators/math/bert_encoder_functor.cu | 31 ++-- .../operators/math/bert_encoder_functor.h | 2 +- .../fluid/operators/math/concat_and_split.cu | 14 +- paddle/fluid/operators/math/concat_test.cc | 21 +-- .../fluid/operators/math/context_project.cu | 4 +- .../fluid/operators/math/cos_sim_functor.cu | 8 +- .../operators/math/eigen_values_vectors.h | 51 +++--- paddle/fluid/operators/math/gru_compute.cu | 20 +-- paddle/fluid/operators/math/im2col_test.cc | 16 +- paddle/fluid/operators/math/sample_prob.cu | 17 +- paddle/fluid/operators/math/sample_prob.h | 2 +- .../operators/math/selected_rows_functor.cu | 29 ++-- .../math/selected_rows_functor_test.cu.cc | 45 ++--- .../operators/math/sequence_padding_test.cc | 8 +- .../fluid/operators/math/sequence_pooling.cu | 16 +- .../operators/math/sequence_pooling_test.cc | 8 +- paddle/fluid/operators/math/tree2col.cu | 20 +-- paddle/fluid/operators/math/unpooling.cu | 32 ++-- paddle/fluid/operators/math/vol2col_test.cc | 16 +- paddle/fluid/operators/matmul_op.cc | 21 +-- paddle/fluid/operators/mean_iou_op.cu | 2 +- paddle/fluid/operators/memcpy_h2d_op.h | 3 +- .../operators/merge_selected_rows_op.cu.cc | 7 +- paddle/fluid/operators/minus_op.cc | 3 +- .../fluid/operators/modified_huber_loss_op.cu | 5 +- .../fluid/operators/nccl/nccl_op_test.cu.cc | 6 +- paddle/fluid/operators/number_count_op.cu | 3 +- paddle/fluid/operators/one_hot_op.cu | 7 +- .../operators/optimizers/cast_with_ptr.h | 4 +- .../optimizers/decayed_adagrad_op.cu | 5 +- .../operators/optimizers/dgc_momentum_op.cu | 5 +- .../distributed_fused_lamb_init_op.cu | 19 +-- .../optimizers/distributed_fused_lamb_op.cu | 51 +++--- paddle/fluid/operators/optimizers/ftrl_op.cu | 3 +- paddle/fluid/operators/optimizers/lamb_op.cu | 7 +- .../operators/optimizers/lars_momentum_op.cu | 51 +++--- .../pow2_decay_with_linear_warmup_op.cu | 4 +- .../optimizers/proximal_adagrad_op.cu | 5 +- .../operators/optimizers/proximal_gd_op.cu | 5 +- paddle/fluid/operators/optimizers/sgd_op.cu | 3 +- .../optimizers/sparse_momentum_op.cu | 7 +- paddle/fluid/operators/pad2d_op.cu | 4 +- .../fluid/operators/pad_constant_like_op.cc | 21 +-- 
paddle/fluid/operators/partial_concat_op.cu | 8 +- paddle/fluid/operators/partial_sum_op.cu | 8 +- paddle/fluid/operators/prroi_pool_op.cu | 7 +- .../operators/prune_gate_by_capacity_op.cu | 2 +- .../pscore/distributed_lookup_table_op.cu.cc | 2 +- .../pscore/distributed_push_sparse_op.cu.cc | 4 +- .../operators/pscore/send_and_recv_op.cc | 11 +- .../pscore/send_and_recv_op_gpu_test.cc | 5 +- paddle/fluid/operators/py_layer_op.cc | 30 ++-- paddle/fluid/operators/qr_op.cu | 155 +++++++++--------- paddle/fluid/operators/quantize_linear_op.cu | 10 +- paddle/fluid/operators/random_crop_op.cu | 2 +- paddle/fluid/operators/random_crop_op.h | 2 +- paddle/fluid/operators/random_routing_op.cu | 3 +- paddle/fluid/operators/rank_attention_op.cu | 18 +- paddle/fluid/operators/rank_loss_op.cc | 7 +- .../fluid/operators/reader/buffered_reader.cc | 4 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 2 +- paddle/fluid/operators/renorm_op.cu | 3 +- .../fluid/operators/repeat_interleave_op.cu | 33 ++-- paddle/fluid/operators/reshape_op.cc | 6 +- paddle/fluid/operators/row_conv_op.cu | 16 +- paddle/fluid/operators/run_program_op.cu.cc | 10 +- paddle/fluid/operators/sample_logits_op.cu | 4 +- paddle/fluid/operators/save_combine_op.cu | 11 +- paddle/fluid/operators/save_op.cu | 15 +- paddle/fluid/operators/seed_op.cu | 5 +- .../sequence_ops/sequence_concat_op.cu.cc | 24 +-- .../sequence_ops/sequence_conv_op.cu.cc | 14 +- .../sequence_ops/sequence_expand_as_op.cu | 29 ++-- .../sequence_ops/sequence_expand_op.cu | 30 ++-- .../sequence_ops/sequence_mask_op.cu | 12 +- .../operators/sequence_ops/sequence_pad_op.cu | 22 ++- .../sequence_ops/sequence_pool_op.cu | 10 +- .../sequence_ops/sequence_reshape_op.cu | 20 +-- .../sequence_ops/sequence_reverse_op.cu | 13 +- .../sequence_ops/sequence_slice_op.cu | 20 +-- .../sequence_softmax_cudnn_op.cu.cc | 10 +- .../sequence_ops/sequence_softmax_op.cc | 6 +- .../sequence_ops/sequence_softmax_op.cu | 20 +-- .../sequence_ops/sequence_unpad_op.cu | 20 +-- paddle/fluid/operators/shuffle_batch_op.cu | 11 +- paddle/fluid/operators/shuffle_channel_op.cu | 11 +- paddle/fluid/operators/slice_op.cc | 44 ++--- paddle/fluid/operators/smooth_l1_loss_op.cu | 10 +- paddle/fluid/operators/space_to_depth_op.cu | 22 ++- paddle/fluid/operators/sparse_attention_op.cu | 16 +- paddle/fluid/operators/spectral_op.cu | 42 ++--- paddle/fluid/operators/spectral_op.cu.h | 32 ++-- paddle/fluid/operators/spp_op.cu.cc | 14 +- .../fluid/operators/squared_l2_distance_op.cu | 8 +- paddle/fluid/operators/squeeze_op.cu.cc | 48 +++--- paddle/fluid/operators/stft_op.cu | 14 +- paddle/fluid/operators/strided_memcpy.h | 3 +- paddle/fluid/operators/strided_memcpy_test.cc | 4 +- paddle/fluid/operators/sum_op.cu | 34 ++-- paddle/fluid/operators/tensor_to_string.h | 2 +- .../operators/tensorrt/tensorrt_engine_op.h | 3 +- .../tensorrt/tensorrt_engine_op_test.cc | 6 +- .../test_leaky_relu_grad_grad_functor.h | 5 +- paddle/fluid/operators/top_k_function_cuda.h | 2 +- paddle/fluid/operators/top_k_op.cu | 28 ++-- paddle/fluid/operators/tree_conv_op.cu | 14 +- .../operators/uniform_random_inplace_op.cu | 5 +- paddle/fluid/operators/uniform_random_op.h | 3 +- paddle/fluid/operators/unpool_op.cu.cc | 28 ++-- paddle/fluid/operators/unsqueeze_op.cu.cc | 53 +++--- paddle/fluid/platform/bfloat16_test.cu | 2 +- paddle/fluid/platform/collective_helper.cc | 12 +- paddle/fluid/platform/collective_helper.h | 2 +- .../platform/cuda_graph_with_memory_pool.cc | 6 +- .../platform/device/gpu/cuda/cudnn_helper.h | 2 +- 
.../platform/device/gpu/gpu_launch_config.h | 20 +-- .../fluid/platform/device/gpu/nccl_helper.h | 10 +- .../platform/device/gpu/rocm/miopen_helper.h | 2 +- paddle/fluid/platform/device_code.cc | 6 +- paddle/fluid/platform/device_context.cc | 13 +- paddle/fluid/platform/device_context.h | 8 +- paddle/fluid/platform/device_context_test.cu | 12 +- .../device_context_test_cuda_graph.cu | 2 +- paddle/fluid/platform/device_event_gpu.cc | 10 +- paddle/fluid/platform/device_event_test.cc | 6 +- paddle/fluid/platform/float16_test.cu | 2 +- paddle/fluid/platform/transform_test.cu | 10 +- paddle/fluid/pybind/pybind.cc | 2 +- .../kernels/funcs/detail/lstm_gpu_kernel.h | 8 +- paddle/phi/kernels/funcs/gru_compute.cu | 22 ++- paddle/phi/kernels/funcs/lstm_compute.cu | 18 +- paddle/phi/kernels/funcs/math_function.cu | 95 ++++++----- paddle/phi/kernels/funcs/sequence2batch.cu | 22 +-- paddle/phi/kernels/gpu/top_k_kernel.cu | 10 +- .../phi/tests/kernels/test_math_function.cu | 24 +-- .../tests/custom_op/custom_raw_op_kernel_op.h | 2 +- 373 files changed, 1976 insertions(+), 2483 deletions(-) diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h index 6eb169d8fff..89ce00fe874 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.h +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -94,7 +94,7 @@ class NPUEventManager { PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 197761dc3c3..c00b081438c 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -104,7 +104,7 @@ class EventManager { bool DeviceId() const { return device_index_; } gpuEvent_t GetRawCudaEvent() const { return event_; } - void Record(const paddle::platform::CUDADeviceContext& ctx) { + void Record(const phi::GPUContext& ctx) { auto device_index = ctx.GetPlace().device; if (!is_created_) { CreateEvent(device_index); @@ -112,7 +112,7 @@ class EventManager { PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); @@ -157,13 +157,13 @@ class EventManager { } } - void Block(const paddle::platform::CUDADeviceContext& ctx) const { + void Block(const phi::GPUContext& ctx) const { if (is_created_) { auto device_index = ctx.GetPlace().device; PADDLE_ENFORCE_EQ(device_index, device_index_, platform::errors::PreconditionNotMet( - "CUDADeviceContext's device %d does not match" + "phi::GPUContext's device %d does not match" "Event's device %d", device_index, device_index_)); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 81db9b94da9..d776f62373e 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -31,10 +31,10 @@ namespace distributed { void SyncDefaultStream( const std::vector& places, - std::vector& ncclEvents, // NOLINT - std::vector>& dev_ctx) { // NOLINT + std::vector& ncclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT for (size_t i = 0; i < places.size(); ++i) { - auto* default_ctx = 
static_cast( + auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); ncclEvents[i].Record(*default_ctx); ncclEvents[i].Block(*dev_ctx[i]); @@ -69,7 +69,7 @@ void ProcessGroupNCCL::NCCLTask::SetOutputs( void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { for (size_t i = 0; i < places_.size(); ++i) { - auto* default_ctx = static_cast( + auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places_[i])); default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent()); } @@ -201,7 +201,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); - std::vector> dev_ctx; + std::vector> dev_ctx; dev_ctx.resize(places.size()); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); @@ -209,7 +209,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( for (size_t i = 0; i < places.size(); ++i) { platform::CUDADeviceGuard guard(places[i]); nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); - dev_ctx[i].reset(new CUDADeviceContext(places[i])); + dev_ctx[i].reset(new phi::GPUContext(places[i])); } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 4dd44771d15..5adb6867eb8 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -45,7 +45,6 @@ namespace paddle { namespace distributed { using Place = paddle::platform::Place; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; class ProcessGroupNCCL : public ProcessGroup { public: @@ -174,8 +173,7 @@ class ProcessGroupNCCL : public ProcessGroup { std::unordered_map> places_to_events_; - std::unordered_map>> + std::unordered_map>> places_to_ctx_; std::set used_place_ids_; diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 5f137c4d0af..8f4466f7baa 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -241,7 +241,7 @@ static void SplitTensorsWithType(const DeviceContext &context, void EagerGroup::ConcatTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( *default_ctx, dense_tensors_, &dense_contents_, dtype_); @@ -264,7 +264,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::Place &place) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( *default_ctx, &dense_contents_, &dense_tensors_, dtype_); @@ -883,7 +883,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); if (platform::is_gpu_place(inner_place_)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - dev_ctx = static_cast( + dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc 
b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e641d6311c6..0b46369b970 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -78,8 +78,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, VLOG(3) << "Loading data for GPU."; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - dynamic_cast(pool.Get(place)); + auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; memory::Copy(gpu_place, static_cast(input_tensor_ptr), diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 3ed4277c61e..b98e85f9c23 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -119,8 +119,7 @@ void SerializeLodTensor(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -168,8 +167,7 @@ void SerializeSelectedRows(framework::Variable* var, char* temp_ptr = new char[tensor->numel() * framework::DataTypeSize(tensor->dtype())]; // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( platform::CPUPlace(), temp_ptr, @@ -265,8 +263,7 @@ void DeserializeLodTensor(framework::Variable* var, framework::DataTypeSize(tensor->dtype())]; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward((void*)temp_ptr, data_len); // NOLINT - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), @@ -311,8 +308,7 @@ void DeserializeSelectedRows( unsigned long data_len; // NOLINT io_buffer_itr.copy_and_forward((void*)(&data_len), 8); // NOLINT io_buffer_itr.copy_and_forward(temp_ptr, data_len); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/distributed/ps/service/heter_client.cc b/paddle/fluid/distributed/ps/service/heter_client.cc index 91a20a432a3..84ef0b02bed 100644 --- a/paddle/fluid/distributed/ps/service/heter_client.cc +++ b/paddle/fluid/distributed/ps/service/heter_client.cc @@ -43,8 +43,7 @@ int GetMicroId(const platform::DeviceContext& ctx, std::vector temp; temp.resize(tensor->numel() * framework::DataTypeSize(tensor->dtype())); char* temp_ptr = temp.data(); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(platform::CPUPlace(), temp_ptr, tensor->place(), diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 1409119daf1..002b8330763 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -134,21 +134,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { - auto* dev_ctx = dynamic_cast( - 
pool.Get(expected_kernel_place)); + auto* dev_ctx = + dynamic_cast(pool.Get(expected_kernel_place)); if (!dev_ctx) { PADDLE_THROW(paddle::platform::errors::Fatal( "Cannot convert device_context to CUDADeviceContext." "This indicates backend mismatch." "Pleas double check your expected place")); } - ScaleDeviceDispatch( - *dense_tensor.get(), - *dev_ctx, - scale, - bias, - bias_after_scale, - dense_out.get()); + ScaleDeviceDispatch(*dense_tensor.get(), + *dev_ctx, + scale, + bias, + bias_after_scale, + dense_out.get()); #endif } else { PADDLE_THROW(paddle::platform::errors::Fatal( diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 6b2b9c9f34a..f8c06a5afff 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -38,8 +38,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::framework::details::tensor_check< - paddle::platform::CUDADeviceContext>( + paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index f275e3f0bf1..6441ce1e788 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -66,8 +66,7 @@ TEST(Benchmark, FluidScaleCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(place, mutable_x, @@ -121,8 +120,7 @@ TEST(Benchmark, FluidMatmulCUDA) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); auto* x_tensor = X->MutableVar()->GetMutable(); @@ -181,8 +179,7 @@ TEST(Benchmark, FluidMLPCUDA) { for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); std::vector x_src_data(MLP_M * MLP_N, MLP_X_VAL); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 05ab86028da..b41938d4856 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -171,8 +171,7 @@ static void FluidCheckTensorValue(const std::shared_ptr& X, if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -204,8 +203,7 @@ static void FluidCheckGradTensorValue( if (place == paddle::platform::CUDAPlace()) { paddle::platform::DeviceContextPool& pool = 
paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = - dynamic_cast(pool.Get(place)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/eager/tests/test_utils.h b/paddle/fluid/eager/tests/test_utils.h index 8540fc7e10d..0e62e5c2da6 100644 --- a/paddle/fluid/eager/tests/test_utils.h +++ b/paddle/fluid/eager/tests/test_utils.h @@ -40,8 +40,8 @@ bool CompareGradTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), @@ -79,8 +79,8 @@ bool CompareTensorWithValue(const paddle::experimental::Tensor& target, #ifdef PADDLE_WITH_CUDA paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(paddle::platform::CUDAPlace())); + auto* dev_ctx = + dynamic_cast(pool.Get(paddle::platform::CUDAPlace())); auto stream = dev_ctx->stream(); paddle::memory::Copy(paddle::platform::CPUPlace(), diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 94e7918e800..cd76747c035 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -92,9 +92,8 @@ REGISTER_OP_WITHOUT_GRADIENT( paddle::framework::OpKernelTestProtoAndCheckerMaker); REGISTER_OP_CPU_KERNEL(test_op, paddle::framework::TestKernel); -REGISTER_OP_CUDA_KERNEL( - test_op, - paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL(test_op, + paddle::framework::TestKernel); static void BuildVar(const std::string& param_name, std::initializer_list arguments, diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 8ffb58f9451..4b5177aaa45 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -2809,7 +2809,7 @@ void SlotRecordInMemoryDataFeed::BuildSlotBatchGPU(const int ins_num) { MiniBatchGpuPack::MiniBatchGpuPack(const paddle::platform::Place& place, const std::vector& infos) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -2843,7 +2843,7 @@ MiniBatchGpuPack::~MiniBatchGpuPack() {} void MiniBatchGpuPack::reset(const paddle::platform::Place& place) { place_ = place; - stream_ = dynamic_cast( + stream_ = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); ins_num_ = 0; diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index d144673d62d..681fb1fdb29 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -89,7 +89,7 @@ void SlotRecordInMemoryDataFeed::FillSlotValueOffset( const int float_slot_size, const UsedSlotGpuType *used_slots) { auto stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); FillSlotValueOffsetKernel<<( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(this->place_)) ->stream(); diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 9333e246c68..59d20306c66 
100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -103,8 +103,8 @@ struct CastDataType { CastDataTypeFunctor()); #if defined(__NVCC__) || defined(__HIPCC__) } else if (platform::is_gpu_place(in_.place())) { - platform::Transform trans; - auto* context = static_cast(ctx_); + platform::Transform trans; + auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu index ed5b7fc692b..8490afd69d9 100644 --- a/paddle/fluid/framework/data_type_transform_test.cu +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -19,7 +19,7 @@ limitations under the License. */ TEST(DataTypeTransform, GPUTransform) { auto cpu_place = paddle::platform::CPUPlace(); auto gpu_place = paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 26ad71bafe6..154bf2b354e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -105,7 +105,7 @@ struct TestBroadcastOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); place_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); #else diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index dafeb5cdb26..1e384143a3c 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -46,7 +46,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( gc_(gc) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - dev_ctx_ = reinterpret_cast( + dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard(place.device); diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index a30e80b204d..0a92269c50a 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -81,7 +81,7 @@ class EagerDeletionOpHandle : public OpHandleBase { GarbageCollector *gc_; // not own std::vector vars_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDADeviceContext *dev_ctx_{nullptr}; + phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif }; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 9cc1929e19a..45d8939f788 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -58,7 +58,7 @@ struct TestGatherOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new phi::GPUContext(p)); } #else PADDLE_THROW( diff --git 
a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 767f7b1e48b..ea292712610 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -367,8 +367,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, if (platform::is_gpu_place(tensor->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - tensor_check( - op_type, var_name, *tensor, place); + tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Tensor[%s] use gpu place. PaddlePaddle must compile with GPU.", diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 59bbef3a095..d91225a8141 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -135,7 +135,7 @@ __global__ void CheckNanInfKernel(const T* value, template <> template -void TensorCheckerVisitor::apply( +void TensorCheckerVisitor::apply( typename std::enable_if< std::is_floating_point::value || std::is_same>::value || @@ -143,7 +143,7 @@ void TensorCheckerVisitor::apply( const { int print_num = 3; - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( platform::DeviceContextPool::Instance().Get(tensor_.place())); int dev_id = tensor_.place().device; PADDLE_ENFORCE_EQ( @@ -226,13 +226,13 @@ void TensorCheckerVisitor::apply( } template <> -void tensor_check(const std::string& op_type, - const std::string& var_name, - const framework::Tensor& tensor, - const platform::Place& place) { +void tensor_check(const std::string& op_type, + const std::string& var_name, + const framework::Tensor& tensor, + const platform::Place& place) { std::call_once(init_multi_gpu_op_var_map_flag, InitMultiGPUOpVarMap); - TensorCheckerVisitor vistor( + TensorCheckerVisitor vistor( op_type, var_name, tensor, place); VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3d8cb208017..82f09f51c23 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -184,8 +184,7 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { dev_ctx.second->Wait(); } } else { - auto stream = - static_cast(waited_ctx)->stream(); + auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); @@ -224,8 +223,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = - static_cast(dev_ctxes_.at(place)) - ->stream(); + static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -254,8 +252,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = - static_cast(pool.Get(place)) - ->stream(); + static_cast(pool.Get(place))->stream(); platform::GpuStreamSync(stream); #else PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -277,7 +274,7 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { if 
(in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto stream = static_cast( + auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP @@ -318,8 +315,8 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; - auto *cuda_dev_ctx = static_cast(p.second); - VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id; + auto *cuda_dev_ctx = static_cast(p.second); + VLOG(10) << "phi::GPUContext:" << cuda_dev_ctx << ", dev_id:" << dev_id; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -339,7 +336,7 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p, callback(); } else { auto *ctx = dev_ctxes_.at(p); - auto *cuda_ctx = static_cast(ctx); + auto *cuda_ctx = static_cast(ctx); cuda_ctx->RecordEvent(events_.at(p.device), callback); } #else diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 0d957bf8130..ad7888c0654 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -69,7 +69,7 @@ struct TestReduceOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CUDAPlace(i); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); + ctxs_.emplace_back(new p::phi::GPUContext(p)); } nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index f0c152c34d3..b453e7c4a81 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -77,7 +77,7 @@ struct ScaleLossGradFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); - auto stream = static_cast(ctx_)->stream(); + auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, out_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index c4bc5905aca..5f46906cf8e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -151,7 +151,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, const int hidden_size, const int expand_embed_dim, const int64_t total_length) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -235,7 +235,7 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); #ifdef PADDLE_WITH_HIP @@ -265,7 +265,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const int expand_embed_dim, const int64_t total_length, const int batch_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/box_wrapper.h 
b/paddle/fluid/framework/fleet/box_wrapper.h index 297f4cb4796..c4cec547bd8 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -223,10 +223,10 @@ class AfsManager { delete read_stream; } int PopenBidirectionalInternal(const char* command, - FILE*& fp_read, // NOLINT - FILE*& fp_write, - pid_t& pid, // NOLINT - bool read, // NOLINT + FILE*& fp_read, // NOLINT + FILE*& fp_write, // NOLINT + pid_t& pid, // NOLINT + bool read, // NOLINT bool write) { std::lock_guard g(g_flock); int fd_read[2]; @@ -440,10 +440,9 @@ class BoxWrapper { std::vector stream_list; for (int i = 0; i < platform::GetGPUDeviceCount(); ++i) { VLOG(3) << "before get context i[" << i << "]"; - platform::CUDADeviceContext* context = - dynamic_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(i))); + phi::GPUContext* context = dynamic_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(i))); stream_list_[i] = context->stream(); stream_list.push_back(&stream_list_[i]); } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index ccc3575c42a..e57a02d7299 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -300,7 +300,7 @@ void AccessorWrapper::CopyForPullImpl( const int64_t total_length, int* gpu_dim, int feature_value_size) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); @@ -333,7 +333,7 @@ void AccessorWrapper::CopyForPushImpl( size_t grad_value_size, std::vector& slot_vector, std::vector& slot_mf_dim_vector) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto slot_lengths_lod = slot_lengths; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 0e806fdb5f5..36b789bdd11 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -90,7 +90,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, const int64_t* gpu_len, int slot_num, int total_len) { - auto stream = dynamic_cast( + auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); CopyKeysKernel<<<(total_len + 1024 - 1) / 1024, 1024, 0, stream>>>( diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c1f8041cc1e..77a666a24d9 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -78,14 +78,12 @@ DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( : GarbageCollector(place, max_memory_size) {} void DefaultStreamGarbageCollector::Wait() const { - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); + static_cast(this->dev_ctx_)->WaitStreamCallback(); } void DefaultStreamGarbageCollector::ClearCallback( const std::function &callback) { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); + static_cast(this->dev_ctx_)->AddStreamCallback(callback); } StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 73139dee6e0..f5c226631e0 100644 --- 
a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -48,8 +48,7 @@ void SetMicroId(paddle::framework::Scope* scope, char* temp_ptr = temp.data(); float* temp_ptr_float = reinterpret_cast(temp_ptr); temp_ptr_float[0] = micro_id; - auto stream = - reinterpret_cast(*dev_ctx).stream(); + auto stream = reinterpret_cast(*dev_ctx).stream(); memory::Copy( place, tensor_data, diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index da52af0faf3..0afeecd06b0 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -514,7 +514,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, } } #ifdef PADDLE_WITH_CUDA - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(context->event_, dev_ctx->stream())); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index ce5f4d743c6..690dea51632 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -229,7 +229,7 @@ void TestMainImpl(std::string func_name, device_code.SetWorkloadPerThread(1); device_code.Launch(n, &args); - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); dev_ctx->Wait(); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index cb77542e262..c3c3581a6a7 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -38,7 +38,7 @@ void CopyToCPUHelper(std::vector *cpu_, size_t *gpu_memory_size_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // COPY GPU Data To CPU - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get((*gpu_)->place())); auto stream = dev_ctx->stream(); void *src = (*gpu_)->ptr(); @@ -63,7 +63,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory::Alloc(place, *gpu_memory_size_); void *dst = (*gpu_)->ptr(); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 0eaf5dd69a5..61d256ffb22 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -38,7 +38,7 @@ static __global__ void multiply_10(int* ptr) { } gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { - return reinterpret_cast( + return reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3680f0aa900..4b72d6bea34 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -854,9 +854,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) { platform::RecordEvent record( "RecordStreamForGC", platform::TracerEventType::UserDefined, 10); - gpuStream_t stream = reinterpret_cast( - 
instr.DeviceContext()) - .stream(); + gpuStream_t stream = + reinterpret_cast(instr.DeviceContext()).stream(); auto TensorRecordStream = [&stream](Tensor& tensor) { auto allocation = tensor.Holder(); if (allocation == nullptr) { diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index fa0528d4882..9ef577f6285 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -236,9 +236,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::OpKernelTest); REGISTER_OP_CUDA_KERNEL( - op_with_kernel, - paddle::framework::OpKernelTest); + op_with_kernel, paddle::framework::OpKernelTest); TEST(OperatorRegistrar, CPU) { paddle::framework::proto::OpDesc op_desc; @@ -263,9 +261,9 @@ TEST(OperatorRegistrar, CUDA) { } static int op_test_value = 0; -using paddle::platform::CUDADeviceContext; using paddle::platform::DeviceContext; using phi::CPUContext; +using phi::GPUContext; namespace paddle { namespace framework { @@ -301,7 +299,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest +class OpMultiKernelTest : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -325,7 +323,7 @@ class OpMultiKernelTest2 }; template -class OpMultiKernelTest2 +class OpMultiKernelTest2 : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -351,12 +349,12 @@ REGISTER_OP_KERNEL( op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest); + paddle::framework::OpMultiKernelTest); REGISTER_OP_KERNEL( op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, - paddle::framework::OpMultiKernelTest2); + paddle::framework::OpMultiKernelTest2); TEST(OperatorRegistrar, OpWithMultiKernel) { paddle::framework::proto::OpDesc op_desc; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 2568a459f31..cb6b2d832bf 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -416,13 +416,12 @@ class ExecutionContext { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - const inline platform::CUDADeviceContext& cuda_device_context() const { + const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, platform::errors::PreconditionNotMet( "Current device context place is not GPUPlace.")); - return *reinterpret_cast( - &device_context_); + return *reinterpret_cast(&device_context_); } #endif diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4ad966887f3..26150b2d04b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -863,12 +863,12 @@ void ParallelExecutor::BCastParamsToDevices( nccl_ctxs->WaitAll(); } else { auto src_place = member_->places_[0]; - auto src_dev_ctx = static_cast( + auto src_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(src_place)); auto sizeof_dtype = framework::SizeOfType(dtype) * numel; for (size_t i = 1; i < member_->places_.size(); ++i) { auto dst_place = member_->places_[i]; - auto dst_dev_ctx = static_cast( + auto dst_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(dst_place)); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); @@ -1492,8 +1492,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope 
*global_scope) { global_scope, member_->places_); auto &pool = platform::DeviceContextPool::Instance(); for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); + auto *dev_ctx = + static_cast(pool.Get(member_->places_[dev_id])); auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); dev_ctx->set_nccl_comm(nccl_ctx.comm()); } diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 6c8e8251579..050a51a0f10 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPhiContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dbb549efa25..f7f05da6340 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -261,8 +261,7 @@ void TensorCopyImpl(const TENSOR& src, "place is %s, context place is %s.", src_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else if (platform::is_cpu_place(src_place) && // NOLINT @@ -284,8 +283,7 @@ void TensorCopyImpl(const TENSOR& src, "destination place is %s, context place is %s.", dst_gpu_place, ctx_gpu_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); } else if (platform::is_gpu_place(src_place) && // NOLINT @@ -308,8 +306,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", src_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } @@ -333,8 +330,7 @@ void TensorCopyImpl(const TENSOR& src, "device context GPU number is %d.", dst_gpu_place.device, ctx_gpu_place.device)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); memory::Copy( dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } @@ -349,8 +345,7 @@ void TensorCopyImpl(const TENSOR& src, platform::errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); @@ -1076,8 +1071,7 @@ void TensorToStream(std::ostream& os, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); - auto& gpu_dev_ctx = - static_cast(dev_ctx); + auto& gpu_dev_ctx = static_cast(dev_ctx); platform::CPUPlace cpu; uintptr_t data = reinterpret_cast(data_ptr); while (size != 0) { @@ -1482,13 +1476,12 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); - memory::Copy( - dst_place, - dst_ptr, - 
src_place, - src_ptr, - size, - reinterpret_cast(*ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(*ctx).stream()); } #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 79532172571..b1bba0f7c35 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -164,13 +164,12 @@ void TensorFromArray(const T* src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -242,13 +241,12 @@ void TensorFromVector(const std::vector& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -340,13 +338,12 @@ inline void TensorFromVector(const std::vector& src, } #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src_place, - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src_place, + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -444,13 +441,12 @@ void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) @@ -503,13 +499,12 @@ inline void TensorToVector(const Tensor& src, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, - dst_ptr, - src.place(), - src_ptr, - size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, + dst_ptr, + src.place(), + src_ptr, + size, + reinterpret_cast(ctx).stream()); } #endif #if defined(PADDLE_WITH_XPU) diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 74454a5a09b..36be5cde506 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -73,7 +73,7 @@ TEST(TensorCopy, Tensor) { // CPU Tensor to GPU Tensor auto gpu_place = new platform::CUDAPlace(0); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -170,7 +170,7 @@ TEST(TensorFromVector, Tensor) { // Copy to GPUTensor gpu_tensor.Resize(phi::make_ddim({3, 3})); auto gpu_place = new paddle::platform::CUDAPlace(); - paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); @@ -238,7 +238,7 @@ 
TEST(TensorToVector, Tensor) { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -255,22 +255,20 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool) { -{ - paddle::framework::Tensor src; - bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); - for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); - } +TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; +bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); +for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); +} - paddle::platform::CPUPlace place; - std::vector dst; - paddle::framework::TensorToVector(src, &dst); +paddle::platform::CPUPlace place; +std::vector dst; +paddle::framework::TensorToVector(src, &dst); - for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); - } +for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); } +} // namespace framework #ifdef PADDLE_WITH_CUDA { @@ -287,7 +285,7 @@ TEST(TensorToVector, Tensor_bool) { }; paddle::framework::Tensor gpu_tensor; paddle::platform::CUDAPlace place; - paddle::platform::CUDADeviceContext gpu_ctx(place); + phi::GPUContext gpu_ctx(place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, gpu_ctx.stream()) .get()); @@ -328,7 +326,7 @@ TEST(TensorToVector, Tensor_bool) { } } #endif -} +} // namespace paddle TEST(TensorFromDLPack, Tensor) { { @@ -525,7 +523,7 @@ TEST(Tensor, FromAndToStream) { Tensor dst_tensor; auto gpu_place = new platform::CUDAPlace(); - platform::CUDADeviceContext gpu_ctx(*gpu_place); + phi::GPUContext gpu_ctx(*gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index b948a191df7..c9d3d2591d0 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -95,7 +95,7 @@ static void AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); auto nccl_dtype = platform::ToNCCLDataType(dtype); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); @@ -220,7 +220,7 @@ void AllReduce(const framework::Variable &src, int ring_id, bool use_calc_stream) { const auto &place = GetVarPlace(src); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::NCCLComm *comm = platform::NCCLCommContext::Instance().Get(ring_id, place); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b656da34fb6..e6e156fa61c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -122,10 +122,9 @@ class TensorAddFunctor #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void operator()(const platform::CUDAPlace& place) const { - platform::CUDADeviceContext* ctx = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = phi::funcs::GetBlas(*ctx); + 
phi::GPUContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = phi::funcs::GetBlas(*ctx); blas.AXPY(numel_, 1., x_, y_); } #else @@ -433,7 +432,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::FP16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -450,7 +449,7 @@ void TensorAdd(const VarType& src, VarType* dst) { if (data_type == framework::proto::VarType::BF16) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -499,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); @@ -551,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); @@ -614,8 +613,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); - PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); + PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); } else { #endif PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 99c4a02e82b..94ac86e97e1 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -85,7 +85,7 @@ void NCCLParallelContext::Init() { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm(&nccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, @@ -119,7 +119,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; - // it will assign nccl_comm in CUDADeviceContext within ring_id + // it will assign nccl_comm in phi::GPUContext within ring_id platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, 
gpu_id, ring_id); @@ -177,7 +177,7 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ring_id, compute_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -207,7 +207,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { ring_id, comm_events_.size())); - auto compute_stream = static_cast( + auto compute_stream = static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); auto comm_stream = @@ -225,7 +225,7 @@ void NCCLParallelContext::WaitComm(int ring_id) { } void NCCLParallelContext::SynchronizeCompute() { - auto *compute_dev_ctx = static_cast( + auto *compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 468263e7be7..1c3165a4538 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -283,11 +283,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ConcatTensorsWithType( - static_cast(context), - dense_tensors_, - &dense_contents_, - dtype_); + ConcatTensorsWithType(static_cast(context), + dense_tensors_, + &dense_contents_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat grad tensors since it's not compiled with NCCL," @@ -344,11 +343,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - SplitTensorsWithType( - static_cast(context), - &dense_contents_, - &dense_tensors_, - dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split grad tensor since it's not compiled with NCCL," diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 5b29e568089..a3f840f38bf 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -27,13 +27,10 @@ void Group::DivNRanks(framework::Tensor *tensor, "Unsupport BF16 in DataParallel for now")); } framework::VisitDataTypeForHIP( - dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #else - framework::VisitDataType(dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + framework::VisitDataType( + dtype_, DivNRanksForAllReduce(tensor, nranks, context)); #endif } #endif diff --git a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc index 67059916d03..597a9a64669 100644 --- a/paddle/fluid/imperative/tests/heter_ccl_context_test.cc +++ b/paddle/fluid/imperative/tests/heter_ccl_context_test.cc @@ -39,7 +39,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { void AllReduceByStream(int local_rank, int device_id) { int data_size = 32; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // heter_parallel_ctx imperative::HeterParallelContext hpc(GetStrategy(local_rank), device_id); diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc 
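The call sites above all follow one pattern: look the place up in platform::DeviceContextPool, cast the returned DeviceContext to phi::GPUContext (previously platform::CUDADeviceContext), and read its compute stream. A minimal illustrative sketch of that pattern, not taken verbatim from any single hunk:

    // Illustrative only: resolve the phi::GPUContext registered for a CUDA
    // place and obtain its compute stream, as the updated call sites do.
    auto *dev_ctx = static_cast<phi::GPUContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    gpuStream_t stream = dev_ctx->stream();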
index 89938d2d7a2..13843ddbe5c 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -78,7 +78,7 @@ void Broadcast(int local_rank, int device_id) { int data_size = 4; float test_data = 7; const auto& place = platform::CUDAPlace(device_id); - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); imperative::NCCLParallelContext npc(GetStrategy(local_rank), place); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8971448071f..bde92c13b4c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -194,8 +194,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, "Only one choice can be made between CPU and XPU.")); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place)); + auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; memory::Copy(dst_gpu_place, static_cast(input_ptr), @@ -283,7 +282,7 @@ bool AnalysisPredictor::Init( // NOTE: If the external_stream equals to global_device_contexts's stream, // then fallback. auto global_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place_)) ->stream(); if (predictor_stream_ != global_stream) { @@ -1658,8 +1657,7 @@ void AnalysisPredictor::CollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); auto gpu_place = place_; - auto *dev_ctx = static_cast( - pool.Get(gpu_place)); + auto *dev_ctx = static_cast(pool.Get(gpu_place)); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); #else @@ -2331,8 +2329,7 @@ void InternalUtils::SyncStream(paddle_infer::Predictor *p) { auto *pred = dynamic_cast(p->predictor_.get()); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - auto *dev_ctx = reinterpret_cast( - pool.Get(pred->place_)); + auto *dev_ctx = reinterpret_cast(pool.Get(pred->place_)); cudaStreamSynchronize(dev_ctx->stream()); #endif } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 34dade3628a..2ba806a0529 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -248,8 +248,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = - static_cast(pool.Get(place_)); + auto *dev_ctx = static_cast(pool.Get(place_)); auto dst_gpu_place = place_; memory::Copy(dst_gpu_place, static_cast(input_ptr), diff --git a/paddle/fluid/inference/api/paddle_infer_contrib.cc b/paddle/fluid/inference/api/paddle_infer_contrib.cc index 171db0807e7..51b27f8ca3a 100644 --- a/paddle/fluid/inference/api/paddle_infer_contrib.cc +++ b/paddle/fluid/inference/api/paddle_infer_contrib.cc @@ -158,8 +158,7 @@ void TensorUtils::CopyTensorImpl(Tensor* p_dst, paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); paddle::platform::CUDAPlace gpu_place(dst.device_); - auto* dev_ctx = static_cast( - pool.Get(gpu_place)); + auto* dev_ctx = static_cast(pool.Get(gpu_place)); if (src.place() == PlaceType::kCPU) { paddle::memory::Copy(gpu_place, diff --git a/paddle/fluid/inference/lite/tensor_utils.cc 
b/paddle/fluid/inference/lite/tensor_utils.cc index e9ffbbd4494..454cd49d3ab 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -139,13 +139,12 @@ void MemoryCopyAsync(const platform::Place& dst_place, } else if (platform::is_gpu_place(dst_place) && platform::is_gpu_place(src_place)) { auto gpu_place = src_place; - memory::Copy( - gpu_place, - dst_data, - gpu_place, - src_data, - size, - static_cast(ctx).stream()); + memory::Copy(gpu_place, + dst_data, + gpu_place, + src_data, + size, + static_cast(ctx).stream()); } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 45b9d222c4c..e4054c5df67 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -74,7 +74,7 @@ void make_fake_model(std::string* model, std::string* param) { framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 43e1d8770c3..eea51e8ff1e 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -118,8 +118,7 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - platform::GpuStreamSync( - static_cast(ctx).stream()); + platform::GpuStreamSync(static_cast(ctx).stream()); } #endif std::vector result; diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index d770ef5478a..06555114164 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -68,7 +68,7 @@ TEST(EngineIOConverterTester, DefaultCPU) { TEST(EngineIOConverterTester, DefaultGPU) { platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); IOConverterTester(ctx); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 2cdf3623306..9b80aeb1d49 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -124,7 +124,7 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); @@ -172,7 +172,7 @@ class TRTConvertValidation { "But received batch_size:%d, max_batch_size_:%d", batch_size, max_batch_size_)); - platform::CUDADeviceContext ctx(place_); + phi::GPUContext ctx(place_); op_->Run(scope_, place_); cudaStreamSynchronize(stream_); std::vector input_output_names; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 499c21723fe..9602e6c8790 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -347,11 +347,11 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto *device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, @@ -403,7 +403,7 @@ int QkvToContextPluginDynamic::enqueue( TransposeQKV( batch, seq_len, head_size_, head_number_, input0_data, tptr, stream); - auto *device_ctx = static_cast( + auto *device_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); @@ -414,7 +414,7 @@ int QkvToContextPluginDynamic::enqueue( apply_scale<<>>( tptr, static_cast(scale_), n_q); - const platform::CUDADeviceContext &dev_ctx = *device_ctx; + const phi::GPUContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 97b97aa3a4b..6ac23e32856 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -34,7 +34,7 @@ namespace tensorrt { class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -94,7 +94,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicEngineTest, test_spmm) { @@ -199,7 +199,7 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -279,7 +279,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { std::vector inputs_; std::vector outputs_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 2836295f006..dc8065ab2a6 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -26,7 +26,7 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) .get()); @@ -69,7 +69,7 @@ class TensorRTEngineTest : public 
::testing::Test { framework::Tensor input_; framework::Tensor output_; TensorRTEngine *engine_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index 57782494eaf..44bcc10abae 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -44,7 +44,7 @@ TEST(BestFitAllocator, concurrent_cuda) { std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); platform::CUDAPlace gpu(0); - platform::CUDADeviceContext dev_ctx(gpu); + phi::GPUContext dev_ctx(gpu); dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu, dev_ctx.stream()) .get()); @@ -64,8 +64,7 @@ TEST(BestFitAllocator, concurrent_cuda) { size_t* data = reinterpret_cast(allocation->ptr()); ForEachFill fill(data); - platform::ForRange for_range(dev_ctx, - allocate_size); + platform::ForRange for_range(dev_ctx, allocate_size); for_range(fill); memory::Copy(platform::CPUPlace(), diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 662bcc401bd..f7e74e04212 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -29,53 +29,51 @@ namespace memory { namespace allocation { /** - * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation. - * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath - * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream + * GPUContextAllocation is a wrapper of the underbeneath allocation. + * GPUContextAllocation adds a CUDA stream callback for the underbeneath + * allocation so that GPUContextAllocation can be used in a CUDA stream * which deletes allocation in the callback. 
*/ -class CUDADeviceContextAllocation : public Allocation { +class GPUContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) + explicit GPUContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} - ~CUDADeviceContextAllocation() { + ~GPUContextAllocation() { PADDLE_ENFORCE_NOT_NULL( dev_ctx_, platform::errors::PreconditionNotMet( - "Device context is not set for CUDADeviceContextAllocation")); + "Device context is not set for GPUContextAllocation")); auto *p_allocation = underlying_allocation_.release(); - VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " + VLOG(4) << "Adding callback to delete GPUContextAllocation at " << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { - VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; + VLOG(4) << "Delete GPUContextAllocation at " << p_allocation; Allocator::AllocationDeleter(p_allocation); }); } - void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { - dev_ctx_ = dev_ctx; - } + void SetGPUContext(const phi::GPUContext *dev_ctx) { dev_ctx_ = dev_ctx; } private: DecoratedAllocationPtr underlying_allocation_; - const platform::CUDADeviceContext *dev_ctx_{nullptr}; + const phi::GPUContext *dev_ctx_{nullptr}; }; /** - * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation + * GPUContextAllocator will allocate a GPUContextAllocation * after waiting for a self-created event on the default stream. It does so to * let the non-default stream be able to allocate GPU memory which will be * released by stream callback */ -class CUDADeviceContextAllocator : public Allocator { +class GPUContextAllocator : public Allocator { public: - explicit CUDADeviceContextAllocator(platform::CUDAPlace place, - gpuStream_t default_stream) + explicit GPUContextAllocator(platform::CUDAPlace place, + gpuStream_t default_stream) : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -87,7 +85,7 @@ class CUDADeviceContextAllocator : public Allocator { #endif } - ~CUDADeviceContextAllocator() { + ~GPUContextAllocator() { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP @@ -103,9 +101,9 @@ class CUDADeviceContextAllocator : public Allocator { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( - "Default stream is not set for CUDADeviceContextAllocator")); + "Default stream is not set for GPUContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = new CUDADeviceContextAllocation( + auto allocation = new GPUContextAllocation( static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP @@ -127,20 +125,20 @@ class CUDADeviceContextAllocator : public Allocator { }; /** - * CUDADeviceContextAllocatorPool is a singletion stores mapping from - * CUDAPlace(s) to std::shared_ptr. When a - * CUDADeviceContext's compute stream isn't default stream, it can call this + * GPUContextAllocatorPool is a singletion stores mapping from + * CUDAPlace(s) to std::shared_ptr. When a + * phi::GPUContext's compute stream isn't default stream, it can call this * class to allocate GPU memory which will be released by a callback after * stream execution. 
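 *
 * Usage sketch (illustrative only; it assumes the Alloc() interface declared
 * just below and an existing phi::GPUContext `dev_ctx` whose compute stream
 * is not the default stream):
 *
 *   AllocationPtr allocation =
 *       GPUContextAllocatorPool::Instance().Alloc(dev_ctx, 256);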
*/ -class CUDADeviceContextAllocatorPool { +class GPUContextAllocatorPool { public: - static CUDADeviceContextAllocatorPool &Instance() { - static CUDADeviceContextAllocatorPool pool; + static GPUContextAllocatorPool &Instance() { + static GPUContextAllocatorPool pool; return pool; } - AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { + AllocationPtr Alloc(const phi::GPUContext &dev_ctx, size_t size) { auto iter = allocators_.find(platform::CUDAPlace(dev_ctx.GetPlace().GetDeviceId())); PADDLE_ENFORCE_NE( @@ -149,25 +147,25 @@ class CUDADeviceContextAllocatorPool { platform::errors::NotFound("No allocator found for CUDAPlace.")); auto &allocator = iter->second; AllocationPtr allocation = allocator->Allocate(size); - static_cast(allocation.get()) - ->SetCUDADeviceContext(&dev_ctx); + static_cast(allocation.get()) + ->SetGPUContext(&dev_ctx); return allocation; } private: - CUDADeviceContextAllocatorPool() { + GPUContextAllocatorPool() { std::vector devices = platform::GetSelectedDevices(); for (int i : devices) { auto place = platform::CUDAPlace(i); auto compute_stream = platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); - auto allocator = std::shared_ptr( - new CUDADeviceContextAllocator(place, compute_stream)); + auto allocator = std::shared_ptr( + new GPUContextAllocator(place, compute_stream)); allocators_.insert(make_pair(place, allocator)); } } - std::map> + std::map> allocators_; }; diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 05e712e72f2..b3308ffdd30 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -37,7 +37,7 @@ const int NUM_STREAMS = 8; const int N = 2; const float DELTA = 1e-1; -using CudaDevCtxVec = std::vector>; +using CudaDevCtxVec = std::vector>; __global__ void kernel(float *x, int n) { int tid = threadIdx.x + blockIdx.x * blockDim.x; @@ -65,7 +65,7 @@ void CheckKernelOutput(float *x, int n) { void MultiStreamCompute(float **data, float **second_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // multi-streams AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); @@ -88,7 +88,7 @@ void MultiStreamCompute(float **data, #endif } -TEST(Malloc, CUDADeviceContextMultiStream) { +TEST(Malloc, GPUContextMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -110,8 +110,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); @@ -143,7 +142,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { } } -TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { +TEST(Malloc, GPUContextMultiThreadMultiStream) { auto place = platform::CUDAPlace(0); platform::SetDeviceId(0); @@ -166,8 +165,7 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { - auto ctx = std::unique_ptr( - new platform::CUDADeviceContext(place)); + auto ctx = std::unique_ptr(new phi::GPUContext(place)); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu 
b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 96831b6bafc..67f2df8cda5 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -65,7 +65,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { allocation_implicit_stream.reset(); gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = @@ -143,7 +143,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { size_t alloc_size = 256; gpuStream_t default_stream = - dynamic_cast( + dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); std::shared_ptr allocation_implicit_stream = diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index fae675142bc..49f78715c2c 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { using framework::Tensor; +using phi::GPUContext; using platform::ActivationDescriptor; -using platform::CUDADeviceContext; using platform::TensorDescriptor; #ifdef PADDLE_WITH_HIP @@ -39,12 +39,12 @@ template struct CudnnActivationFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationFunctor(const CUDADeviceContext& ctx, + CudnnActivationFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -77,7 +77,7 @@ struct CudnnActivationFunctor { out->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -90,12 +90,12 @@ template struct CudnnActivationGradFunctor { using ELEMENT_TYPE = T; #ifdef PADDLE_WITH_HIP - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const miopenActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} #else - CudnnActivationGradFunctor(const CUDADeviceContext& ctx, + CudnnActivationGradFunctor(const phi::GPUContext& ctx, const T& c, const cudnnActivationMode_t& m) : ctx_(ctx), coef_(c), mode_(m) {} @@ -141,7 +141,7 @@ struct CudnnActivationGradFunctor { dx->mutable_data(ctx_.GetPlace()))); #endif } - const CUDADeviceContext& ctx_; + const phi::GPUContext& ctx_; const T coef_; #ifdef PADDLE_WITH_HIP const miopenActivationMode_t mode_; @@ -152,12 +152,12 @@ struct CudnnActivationGradFunctor { template struct CudnnReluFunctor : public CudnnActivationFunctor { - explicit CudnnReluFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} }; template struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnReluGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -167,12 +167,12 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { template struct CudnnRelu6Functor : public CudnnActivationFunctor { - explicit 
CudnnRelu6Functor(const CUDADeviceContext& ctx) + explicit CudnnRelu6Functor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} }; template struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { - explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) + explicit CudnnRelu6GradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor( ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} @@ -183,12 +183,12 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { template struct CudnnSigmoidFunctor : public CudnnActivationFunctor { - explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} }; template struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnSigmoidGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -198,12 +198,12 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { template struct CudnnTanhFunctor : public CudnnActivationFunctor { - explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhFunctor(const phi::GPUContext& ctx) : CudnnActivationFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} }; template struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { - explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) + explicit CudnnTanhGradFunctor(const phi::GPUContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} static constexpr ActBwdOpFwdDeps FwdDeps() { @@ -221,7 +221,7 @@ class CudnnActivationKernel framework::Tensor* Out = nullptr; ExtractActivationTensor(context, &X, &Out); Out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out); } @@ -242,7 +242,7 @@ class CudnnActivationGradKernel ExtractActivationGradTensor( context, &X, &Out, &dOut, &dX); dX->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); Functor functor(dev_ctx); functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"), GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"), diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 9aafb70c7dc..76a05aa37a6 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -194,87 +194,74 @@ using CudaELUGradNegativeAlphaFunctor = namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + 
ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); -#define REGISTER_ACTIVATION_CUDA_KERNEL_INT( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>, \ - ops::ActivationCudaKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ - ops::ActivationGradCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( relu6_grad, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index f638f6943ff..8fcdb323884 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -211,7 +211,7 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(affine_channel, ops::AffineChannelCUDAKernel, diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc index a5b3f9fcfda..48832ac1d6d 100644 --- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -35,7 +35,7 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support for CUDAPlace.Please switch your context from " "CPUPlace to CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* theta = ctx.Input("Theta"); auto* output = ctx.Output("Output"); @@ -83,7 +83,7 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel { "support for CUDAPlace. 
Please switch " "your context from CPUPlace to " "CUDAPlace or update your cudnn.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto output_grad = ctx.Input(framework::GradVarName("Output")); auto theta_grad = ctx.Output(framework::GradVarName("Theta")); diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index b1ed3835e75..a5d4c6484a1 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -29,7 +29,7 @@ __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { } template -struct Linspace { +struct Linspace { void operator()(T start, T end, int count, @@ -191,7 +191,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel { w = size_attr[3]; } T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( ctx.cuda_device_context(), theta_grad, static_cast(0)); T h_step; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 0338fb5d2f2..35b667825af 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -82,7 +82,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); auto outs = ctx.MultiOutput("Out"); @@ -92,8 +92,7 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); framework::Tensor inverse_scale = - ctx.AllocateTmpTensor({1}, - dev_ctx); + ctx.AllocateTmpTensor({1}, dev_ctx); MPDType* inverse_scale_v = inverse_scale.template data(); InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index d76dd13e5bc..4c927066892 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -87,11 +87,9 @@ __global__ void FusedFillIf(T** outs, } template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, @@ -134,9 +132,9 @@ class UpdateLossScalingFunctor -class LazyZeros { +class LazyZeros { public: - void operator()(const platform::CUDADeviceContext& dev_ctx, + void operator()(const phi::GPUContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { @@ -204,7 +202,7 @@ class LazyZeros { namespace ops = paddle::operators; namespace plat = paddle::platform; -using GPU = paddle::platform::CUDADeviceContext; +using GPU = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(update_loss_scaling, ops::UpdateLossScalingKernel, diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index a2af64e2276..5fee66d968b 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -54,7 +54,7 @@ 
struct ArrayToLoDFunctor : public std::unary_function { Apply(static_cast(pool.Get(place))); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(pool.Get(place))); + Apply(static_cast(pool.Get(place))); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index 164744527e2..3f36e8b1347 100644 --- a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -82,8 +82,7 @@ class AssignPosCUDAKernel : public framework::OpKernel { *eff_num_len, platform::CPUPlace(), &cpu_eff_num_len); cpu_eff_num_len_data = cpu_eff_num_len.data()[0]; } - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::DDim out_dims = phi::make_ddim({cpu_eff_num_len_data}); auto out_data = out->mutable_data(out_dims, place); diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index e97f1261845..362489e51ac 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -114,9 +114,9 @@ class BatchFCCUDAKernel : public framework::OpKernel { T* out_data = output->mutable_data(ctx.GetPlace()); // initialize auto out_eigen = framework::EigenVector::Flatten(*output); - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); out_eigen.device(place) = out_eigen.constant(static_cast(0)); CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -127,7 +127,7 @@ class BatchFCCUDAKernel : public framework::OpKernel { int64_t strideA = ins_num * in_dim; int64_t strideB = in_dim * out_dim; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, ins_num, @@ -169,9 +169,9 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { auto in_dim = input_dims[2]; auto out_dim = w_dims[2]; - auto& dev_ctx = ctx.template device_context(); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& dev_ctx = ctx.template device_context(); + auto& place = + *ctx.template device_context().eigen_device(); // initialize dx->mutable_data(ctx.GetPlace()); auto dx_eigen = framework::EigenVector::Flatten(*dx); @@ -199,7 +199,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { out_dim, db_data); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -238,7 +238,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(batch_fc, ops::BatchFCCUDAKernel, ops::BatchFCCUDAKernel); diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc index 15aca070221..93f538e6789 100644 --- a/paddle/fluid/operators/beam_search_op.cu.cc +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -17,9 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); +REGISTER_OP_CUDA_KERNEL(beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index a4fa631f741..7afb3f1135d 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_OP_CUDA_KERNEL(transfer_dtype, ops::CastOpKernel, diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index bec1bb662de..2548b135591 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -150,7 +150,7 @@ class CenterLossCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(center_loss, ops::CenterLossCUDAKernel, ops::CenterLossCUDAKernel); diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc index afa350ef116..ae9dd3401fd 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cu.cc @@ -17,8 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( cinn_instruction_run, - ops::CinnInstructionRunOpKernel); + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 64980dfb013..7dbf2fee0c2 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -18,6 +18,4 @@ limitations under the License. 
*/ /* see [Why use single type kernel] */ REGISTER_OP_CUDA_KERNEL( - cinn_launch, - paddle::operators::CinnLaunchOpKernel); + cinn_launch, paddle::operators::CinnLaunchOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.cc b/paddle/fluid/operators/cinn/cinn_op_helper.cc index 26fee2d9e57..48efa5c5116 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.cc +++ b/paddle/fluid/operators/cinn/cinn_op_helper.cc @@ -21,10 +21,8 @@ namespace paddle::operators::details { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx) { - const auto& dev_ctx = - ctx.template device_context(); +void* GetStream(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = ctx.template device_context(); return dev_ctx.stream(); } #endif diff --git a/paddle/fluid/operators/cinn/cinn_op_helper.h b/paddle/fluid/operators/cinn/cinn_op_helper.h index 55ee3789c0a..4387095fefa 100644 --- a/paddle/fluid/operators/cinn/cinn_op_helper.h +++ b/paddle/fluid/operators/cinn/cinn_op_helper.h @@ -40,8 +40,7 @@ void* GetStream(const framework::ExecutionContext& ctx) { #ifdef PADDLE_WITH_CUDA template <> -void* GetStream( - const framework::ExecutionContext& ctx); +void* GetStream(const framework::ExecutionContext& ctx); #endif } // namespace details diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index a0642694843..b92062b1aee 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -375,7 +375,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace()); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(ctx.GetPlace())) ->stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -607,6 +607,5 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( class_center_sample, - ops::ClassCenterSampleCUDAKernel, - ops::ClassCenterSampleCUDAKernel); + ops::ClassCenterSampleCUDAKernel, + ops::ClassCenterSampleCUDAKernel); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 561d2696fef..4a11e6d5723 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -519,11 +519,10 @@ REGISTER_OP_CPU_KERNEL(coalesce_tensor, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif #if defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index af299fc6b5a..174a5afa69d 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -17,10 +17,9 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - allreduce, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_CUDA_KERNEL(allreduce, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/collective/allreduce_op.h b/paddle/fluid/operators/collective/allreduce_op.h index 12708ab666d..12507d76fe7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.h +++ b/paddle/fluid/operators/collective/allreduce_op.h @@ -38,7 +38,7 @@ class AllReduceOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet( "AllReduce op can run on gpu place only for now.")); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto in = ctx.Input("X"); auto out = ctx.Output("Out"); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index ef59772b173..718f60c7737 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -47,7 +47,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index c59d8315a36..de15395eb4d 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -40,7 +40,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto stream = static_cast(dev_ctx)->stream(); + auto stream = static_cast(dev_ctx)->stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); diff --git a/paddle/fluid/operators/collective/broadcast_op.cu.cc b/paddle/fluid/operators/collective/broadcast_op.cu.cc index ceac881bff1..4f21dc2992a 100644 --- a/paddle/fluid/operators/collective/broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/broadcast_op.cu.cc @@ -54,7 +54,7 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { platform::errors::PreconditionNotMet("Currently, the broadcast op can " "only be an In-Place operation.")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto comm = dev_ctx.nccl_comm(); auto stream = dev_ctx.stream(); diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index e9228a28dba..8356bbb65a8 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -68,7 +68,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - 
stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index e679fb2fe9c..718c77aaa6f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -419,7 +419,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 33e320816de..e43c67d7bf3 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -54,7 +54,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index 3fb2047dc27..74bdd2b63ae 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -90,7 +90,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { T* recv_buff = temp_out.data(); gpuStream_t stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclAllGather(send_buff, @@ -113,9 +113,9 @@ class CConcatOpCUDAKernel : public framework::OpKernel { offset += rows_per_tensor; } - math::ConcatFunctor functor; + math::ConcatFunctor functor; out->mutable_data(out_dims, place); - auto& dev_ctx2 = ctx.template device_context(); + auto& dev_ctx2 = ctx.template device_context(); functor(dev_ctx2, inputs, axis, out); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 3f14c0ac9c1..53aef8e8357 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -91,8 +91,7 @@ class CEmbeddingCUDAKernel : public framework::OpKernel { auto *ids_t = context.Input("Ids"); auto *output_t = context.Output("Out"); - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; @@ -142,8 +141,7 @@ template class CEmbeddingGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const auto &dev_ctx = - context.template device_context(); + const auto &dev_ctx = context.template device_context(); const int64_t start_idx = context.Attr("start_index"); auto ids_t = context.Input("Ids"); auto d_output_t = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/collective/c_reduce_op.h 
b/paddle/fluid/operators/collective/c_reduce_op.h index f2e6cdbe2ca..dae4fa497f7 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -312,7 +312,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 33617d8787d..354c31c213b 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -55,7 +55,7 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index b7e6262b81e..42d9ed2342c 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -61,7 +61,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index de83bc773ba..ef7e298aaf6 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -108,10 +108,10 @@ struct CSoftmaxWithCrossEntropyFunctor { const auto& place = ctx.GetPlace(); const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); // use global calculate stream - const auto stream = static_cast( + const auto stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -136,8 +136,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* logits_max_buff = logits_max.mutable_data(place); auto eigen_logits_max = math::EigenMatrix::From(logits_max); @@ -166,7 +165,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -217,8 +216,7 @@ struct CSoftmaxWithCrossEntropyFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = 
math::EigenMatrix::From(sum_exp_logits); @@ -262,7 +260,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const int rank = ctx.Attr("rank"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto map = distributed::ProcessGroupMapFromGid::getInstance(); distributed::ProcessGroup* pg = map->get(rid); @@ -290,8 +288,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 1, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); auto eigen_logits_max = math::EigenMatrix::From(logits_max); Eigen::DSizes along_axis(1); @@ -314,7 +311,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 3, obtain predict target Tensor predicted_logits; predicted_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + ctx.AllocateTmpTensor({N, 1}, dev_ctx); predicted_logits.mutable_data(place); auto t = framework::EigenVector::Flatten(predicted_logits); @@ -358,8 +355,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { // step 5, obtain sum_exp_logits Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); @@ -395,8 +391,7 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); const int rank = context.Attr("rank"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); if (logit_grad != softmax) { framework::TensorCopy( diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 06c251e32cf..5b34e4ba9d5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -83,7 +83,7 @@ class CSplitOpCUDAKernel : public framework::OpKernel { rank, nranks)); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto dims = x->dims(); auto dims_size = dims.size(); // final dim diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index f800be642f7..5b26e47a8fd 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ class CSyncCalcStreamKernel : public framework::OpKernel { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) auto place = ctx.GetPlace(); - auto dev_ctx = static_cast( + auto dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); platform::GpuStreamSync(dev_ctx->stream()); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index 4a60f255b47..bacbe014a34 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc 
b/paddle/fluid/operators/collective/c_wait_compute_op.cc index cec57c6bfd7..34569b0a4b6 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { int ring_id = Attr("ring_id"); auto compute_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); auto comm_stream = diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 2b8ba4049c5..3d7ab09f45e 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -83,7 +83,7 @@ struct GlobalGatherFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index b8b260c74ce..1337901f185 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -82,7 +82,7 @@ struct GlobalScatterFunctor { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 7e25f6876ad..6bc18254737 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -81,7 +81,7 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index da6690a96a1..526f9425992 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -82,7 +82,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 874bd61d198..84b1e7148df 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -78,7 +78,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = 
static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 944644f4101..ec18a172e1f 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -158,7 +158,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 063eb5c1f82..37b18703031 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -153,7 +153,7 @@ class SendOpV2CUDAKernel : public framework::OpKernel { auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx)->stream(); + stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 2e65e9f352d..89b703d8d1a 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -124,8 +124,7 @@ __global__ void ConvShiftDy(const T *x, } // namespace template -class ConvShiftKernel - : public framework::OpKernel { +class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -146,8 +145,7 @@ class ConvShiftKernel dim3 grid_dim(num_x_blocks, batch_size); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -155,8 +153,7 @@ class ConvShiftKernel }; template -class ConvShiftGradKernel - : public framework::OpKernel { +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -174,9 +171,8 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = - context.template device_context(); - phi::funcs::SetConstant zero; + auto &device_ctx = context.template device_context(); + phi::funcs::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -212,9 +208,7 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_CUDA_KERNEL( - conv_shift_grad, - ops::ConvShiftGradKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL(conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 746debe21e5..3205d5b3538 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -44,7 +44,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( auto data_type = 
OperatorWithKernel::IndicateVarDataType(ctx, "Input"); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; if (use_cudnn) { library_ = framework::LibraryType::kCUDNN; @@ -348,7 +348,7 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif @@ -435,7 +435,7 @@ framework::OpKernelType ConvTransposeOpDoubleGrad::GetExpectedKernelType( use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc index e753b148fe1..3172625681a 100644 --- a/paddle/fluid/operators/copy_cross_scope_test.cc +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -132,7 +132,7 @@ void Compare2(f::Scope* scope, #ifdef PADDLE_WITH_CUDA TEST(copy_cross_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); @@ -142,7 +142,7 @@ TEST(copy_cross_scope, CUDA_fp32) { TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { f::Scope scope; - p::CUDADeviceContext ctx(p::CUDAPlace(0)); + phi::GPUContext ctx(p::CUDAPlace(0)); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(p::CUDAPlace(0), ctx.stream()) .get()); diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index d632de3ac86..434506c033c 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -176,7 +176,7 @@ __global__ void correlation_forward(T *output, } } -// class CorrelationKernel +// class CorrelationKernel template class CorrelationCUDAKernel : public framework::OpKernel { public: @@ -197,7 +197,7 @@ class CorrelationCUDAKernel : public framework::OpKernel { auto *output = ctx.Output("Output"); output->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // base on input1, NCHW auto in_dims = input1->dims(); @@ -209,11 +209,11 @@ class CorrelationCUDAKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); @@ -468,7 +468,7 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { grad_input1->mutable_data(ctx.GetPlace()); auto *grad_input2 = 
ctx.Output(framework::GradVarName("Input2")); grad_input2->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto in_dims = input1->dims(); int N = in_dims[0]; @@ -479,11 +479,11 @@ class CorrelationCUDAGradKernel : public framework::OpKernel { int padded_input_height = H + 2 * pad_size; int padded_input_width = W + 2 * pad_size; - Tensor rinput1 = ctx.AllocateTmpTensor( + Tensor rinput1 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput1.mutable_data(ctx.GetPlace()); - Tensor rinput2 = ctx.AllocateTmpTensor( + Tensor rinput2 = ctx.AllocateTmpTensor( {N, padded_input_height, padded_input_width, C}, dev_ctx); rinput2.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu index 3d144ca29d9..5599a9b19b0 100644 --- a/paddle/fluid/operators/cos_sim_op.cu +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cos_sim, ops::CosSimKernel); -REGISTER_OP_CUDA_KERNEL( - cos_sim_grad, - ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL(cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index bdc1f61fbe0..41e9d673d3f 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -230,11 +230,9 @@ REGISTER_OP_CPU_KERNEL(crop_grad, ops::CropGradKernel, ops::CropGradKernel); -REGISTER_OP_CUDA_KERNEL( - crop, - ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, - ops::CropGradKernel, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, + ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CUDA_KERNEL(crop_grad, + ops::CropGradKernel, + ops::CropGradKernel); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu index cabe21919a9..2557532a940 100644 --- a/paddle/fluid/operators/cross_entropy_op.cu +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
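The copy_cross_scope test hunk above (and similar test files later in this patch, such as dlnne_engine_op_test.cc and feed_forward_test.cu) constructs the GPU context directly and wires in an allocator by hand. Below is a short sketch of that construction pattern, assuming the AllocatorFacade calls spelled out in the hunk; any additional initialization the tests perform after this point is not reproduced here.

// Sketch: direct construction of the unified GPU context in a test, with the
// allocator supplied explicitly, mirroring the copy_cross_scope hunk above.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

void MakeTestGpuContext() {
  paddle::platform::CUDAPlace place(0);
  phi::GPUContext ctx(place);
  // The context no longer carries a default allocator; the test hands it one
  // from the global facade, keyed on this place and the context's own stream.
  ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                       .GetAllocator(place, ctx.stream())
                       .get());
}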
*/ namespace plat = paddle::platform; namespace ops = paddle::operators; -using CUDACtx = paddle::platform::CUDADeviceContext; +using CUDACtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, ops::CrossEntropyOpKernel, diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 25d2c4e77d1..2095b3d3858 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -154,10 +154,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { if (host_out_lod0.back() == 0) { output->Resize({1, 1}); output->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), - output, - -1); + phi::funcs::SetConstant set_constant; + set_constant( + ctx.template device_context(), output, -1); } } } diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index bf3009e1fe2..d53333d2176 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -247,7 +247,7 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { SequenceLength = operators::GetDataFromTensor(sequence_length); } - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); int seq_length = x->dims()[0]; @@ -262,9 +262,9 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int weight_numel; bool w_initialized = false; auto place = ctx.GetPlace(); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); if (is_test && ctx.HasInput("W")) { auto *W = ctx.Input("W"); w_initialized = W->IsInitialized() ? true : false; @@ -460,7 +460,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { auto weight_grad_list = ctx.MultiOutput( framework::GradVarName("WeightList")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto input_dims = input->dims(); @@ -479,9 +479,9 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { bool continuous = is_continuous>(weight_list); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()) + .stream(); Tensor weight_whole; T *weight_data = nullptr; @@ -494,7 +494,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } Tensor weight_grad; - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index a5c3b51d300..d08d9e14ef0 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -99,8 +99,7 @@ class CVMCUDAKernel : public framework::OpKernel { T* y_data = y->mutable_data(context.GetPlace()); // for Input X do not have Lod Information. - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (x->NumLevels() == 0) { CvmComputeKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -147,8 +146,7 @@ class CVMGradCUDAKernel : public framework::OpKernel { auto item_size = dx_numel / batch_size; // for Input X do not have Lod Information. 
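The ctc_align, cudnn_lstm, and cvm hunks above fetch the CUDA stream either through the typed device_context accessor of the execution context or by casting the untyped DeviceContext reference; the template and cast arguments were stripped in extraction. Below is a hedged sketch of both spellings against the unified context, using a placeholder kernel name.

// Sketch: the two stream-access spellings used by the migrated kernels.
// ExampleStreamKernel is a placeholder, not an operator in this patch.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename T>
class ExampleStreamKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // 1) Typed lookup through the execution context (cvm/ctc_align style).
    auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto stream = dev_ctx.stream();

    // 2) Cast of the untyped DeviceContext (cudnn_lstm style).
    auto stream2 =
        reinterpret_cast<const phi::GPUContext&>(ctx.device_context())
            .stream();
    (void)stream;
    (void)stream2;
  }
};

}  // namespace operators
}  // namespace paddle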
- auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); if (dx->NumLevels() == 0) { CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index f87c88b2aaf..e3f510e755b 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -104,8 +104,7 @@ __global__ void KernelUpdateParam(int C, } template -class DataNormKernel - : public framework::OpKernel { +class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -130,8 +129,7 @@ class DataNormKernel T *scale_out_data = ctx.Output("Scales")->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); KernelMeanScale<<>>( C, @@ -146,8 +144,7 @@ class DataNormKernel }; template -class DataNormGradKernel - : public framework::OpKernel { +class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -180,8 +177,7 @@ class DataNormGradKernel ctx.Output(framework::GradVarName("BatchSquareSum")) ->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); if (d_x != nullptr) { KernelDataNormBP<< } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - data_norm, - ops::DataNormKernel, - ops::DataNormKernel); -REGISTER_OP_CUDA_KERNEL( - data_norm_grad, - ops::DataNormGradKernel, - ops::DataNormGradKernel); +REGISTER_OP_CUDA_KERNEL(data_norm, + ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CUDA_KERNEL(data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 18a38a0471d..d974a60197d 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -624,7 +624,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling, ops::DeformablePSROIPoolCUDAKernel, ops::DeformablePSROIPoolCUDAKernel); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cu b/paddle/fluid/operators/dequantize_abs_max_op.cu index 964f740a03f..57d2c02adb0 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cu +++ b/paddle/fluid/operators/dequantize_abs_max_op.cu @@ -27,8 +27,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, float max_range, @@ -46,14 +46,14 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; 
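The dequantize_abs_max hunk just above re-keys its per-device functor on phi::GPUContext and instantiates it explicitly. Below is a condensed sketch of that specialization-plus-instantiation pattern; the functor name and the element-wise math are placeholders, not the op's real dequantize formula.

// Sketch of the functor-specialization pattern: the GPU specialization is
// keyed on phi::GPUContext and instantiated explicitly at the bottom.
// ScaleFunctor and KeScale are illustrative names only.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
struct ScaleFunctor;

template <typename T>
__global__ void KeScale(const T* in, T factor, int64_t num, T* out) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < num) out[idx] = in[idx] * factor;
}

template <typename T>
struct ScaleFunctor<phi::GPUContext, T> {
  void operator()(const phi::GPUContext& dev_ctx,
                  const framework::Tensor* in,
                  T factor,
                  framework::Tensor* out) {
    const T* in_data = in->data<T>();
    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
    int64_t num = in->numel();
    const int threads = 512;
    const int blocks = static_cast<int>((num + threads - 1) / threads);
    KeScale<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
        in_data, factor, num, out_data);
  }
};

// Explicit instantiations, mirroring the "template struct ...;" lines above.
template struct ScaleFunctor<phi::GPUContext, float>;
template struct ScaleFunctor<phi::GPUContext, double>;

}  // namespace operators
}  // namespace paddle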
REGISTER_OP_CUDA_KERNEL(dequantize_abs_max, ops::DequantizeMaxAbsKernel, ops::DequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index dc4e03a858f..2c47d9b17aa 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -36,8 +36,8 @@ __global__ void KeDequantize(const T* in, } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* dict, framework::Tensor* out) { @@ -54,11 +54,11 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_log, ops::DequantizeLogKernel); diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h index 82678d456c3..4c729a65f59 100644 --- a/paddle/fluid/operators/detail/strided_memcpy.h +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -39,8 +39,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy( gpu_place, dst, gpu_place, src, sizeof(T), cuda_ctx.stream()); #else @@ -66,8 +65,7 @@ struct StridedMemcpyFunctor { } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(dev_ctx); + auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index 036a33cff8e..30250eb8cc0 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -96,8 +96,7 @@ class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); anchors->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index e41f4e9b3b7..90be767e2f2 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -47,14 +47,14 @@ struct RangeInitFunctor { }; template -static void SortDescending(const platform::CUDADeviceContext &ctx, +static void SortDescending(const phi::GPUContext &ctx, const Tensor &value, Tensor *value_out, Tensor *index_out) { int num = static_cast(value.numel()); Tensor index_in_t; int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - platform::ForRange for_range(ctx, num); + platform::ForRange for_range(ctx, num); for_range(RangeInitFunctor{0, 1, idx_in}); int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); @@ -287,7 +287,7 @@ static __global__ void NMSKernel(const int n_boxes, } template -static void NMS(const platform::CUDADeviceContext &ctx, +static void NMS(const phi::GPUContext &ctx, const Tensor &proposals, const Tensor &sorted_indices, const T 
nms_threshold, diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 084faf32e6b..87dc4a30abb 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -75,7 +75,6 @@ class GPUBoxClipKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - box_clip, - ops::GPUBoxClipKernel, - ops::GPUBoxClipKernel); +REGISTER_OP_CUDA_KERNEL(box_clip, + ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 7f66cb86b56..f87a636bdfb 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -152,7 +152,5 @@ class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( box_decoder_and_assign, - ops::BoxDecoderAndAssignCUDAKernel, - ops::BoxDecoderAndAssignCUDAKernel); + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 01346c94fa6..0fbc54d3135 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -138,8 +138,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_total( - dev_ctx, total_roi_num); + platform::ForRange for_range_total(dev_ctx, total_roi_num); for_range_total(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -188,8 +187,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor batch_index_t; int* batch_idx_in = batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); - platform::ForRange for_range_post( - dev_ctx, real_post_num); + platform::ForRange for_range_post(dev_ctx, real_post_num); for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); Tensor out_id_t; @@ -228,7 +226,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &length_lod, static_cast(0)); int blocks = NumBlocks(real_post_num); @@ -274,7 +272,5 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( collect_fpn_proposals, - ops::GPUCollectFpnProposalsOpKernel, - ops::GPUCollectFpnProposalsOpKernel); + ops::GPUCollectFpnProposalsOpKernel, + ops::GPUCollectFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 8521b28127b..aa60d054546 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -164,8 +164,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); 
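The bbox_util and collect_fpn_proposals hunks above now parameterize utility templates such as platform::ForRange and phi::funcs::SetConstant on the unified context; the template arguments themselves were again lost in extraction. Below is a small self-contained sketch of how those helpers are typically invoked after the change (RangeInitFunctor is redeclared locally only so the snippet stands alone).

// Sketch: utility templates keyed on phi::GPUContext, as in the detection-op
// hunks above. Compile as CUDA (.cu) code.
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace {

struct RangeInitFunctor {
  int start_;
  int delta_;
  int* out_;
  __device__ void operator()(size_t i) {
    out_[i] = start_ + static_cast<int>(i) * delta_;
  }
};

void InitIndexAndZeroLod(const phi::GPUContext& dev_ctx,
                         int num,
                         int* idx_in,
                         paddle::framework::Tensor* length_lod) {
  // Fill idx_in with 0, 1, 2, ... on the GPU.
  paddle::platform::ForRange<phi::GPUContext> for_range(dev_ctx, num);
  for_range(RangeInitFunctor{0, 1, idx_in});

  // Zero a tensor through the generic SetConstant functor.
  phi::funcs::SetConstant<phi::GPUContext, int> set_zero;
  set_zero(dev_ctx, length_lod, static_cast<int>(0));
}

}  // namespace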
GenDensityPriorBox<<>>(feature_height, feature_width, img_height, diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 3fd309aee40..1063382ef33 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -129,7 +129,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor sub_lod_list; sub_lod_list.Resize({num_level, lod_size}); int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, &sub_lod_list, static_cast(0)); Tensor target_lvls; @@ -155,7 +155,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor index_in_t; int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); - platform::ForRange for_range(dev_ctx, roi_num); + platform::ForRange for_range(dev_ctx, roi_num); for_range(RangeInitFunctor{0, 1, idx_in}); Tensor keys_out_t; @@ -258,7 +258,5 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distribute_fpn_proposals, - ops::GPUDistributeFpnProposalsOpKernel, - ops::GPUDistributeFpnProposalsOpKernel); + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 00ffeebc08b..ed1ad6da34d 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_info, const Tensor &anchors, const Tensor &variances, @@ -59,7 +59,7 @@ static std::pair ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -94,7 +94,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -266,5 +266,4 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( generate_proposals, - ops::CUDAGenerateProposalsKernel); + ops::CUDAGenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index d5005f435f2..682a9adf659 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -34,7 +34,7 @@ using LoDTensor = framework::LoDTensor; namespace { template static std::pair ProposalForOneImage( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const Tensor &im_shape, const Tensor &anchors, const Tensor &variances, @@ -60,7 +60,7 @@ static std::pair 
ProposalForOneImage( proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); { - platform::ForRange for_range(ctx, pre_nms_num); + platform::ForRange for_range(ctx, pre_nms_num); for_range(BoxDecodeAndClipFunctor{anchors.data(), bbox_deltas.data(), variances.data(), @@ -98,7 +98,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -274,5 +274,4 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( generate_proposals_v2, - ops::CUDAGenerateProposalsV2Kernel); + ops::CUDAGenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu index 8342b4138c8..dc27f326538 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cu +++ b/paddle/fluid/operators/detection/iou_similarity_op.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/iou_similarity_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - iou_similarity, - ops::IOUSimilarityKernel, - ops::IOUSimilarityKernel); +REGISTER_OP_CUDA_KERNEL(iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu index 18088067147..1cdf7691338 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -149,8 +149,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (box_num + block - 1) / block; - auto stream = - ctx.template device_context().stream(); + auto stream = ctx.template device_context().stream(); boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 5bf68c154c6..73b28f8f0e4 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -382,7 +382,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { T* out2in_w_data = out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - phi::funcs::SetConstant init; + phi::funcs::SetConstant init; init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); auto transformed_height = ctx.Attr("transformed_height"); @@ -519,7 +519,7 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 9ca480ba727..3def90fd459 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -185,12 +185,9 @@ class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { namespace ops = 
paddle::operators; REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss, - ops::GPUSigmoidFocalLossKernel, - ops::GPUSigmoidFocalLossKernel); + ops::GPUSigmoidFocalLossKernel, + ops::GPUSigmoidFocalLossKernel); REGISTER_OP_CUDA_KERNEL( sigmoid_focal_loss_grad, - ops::GPUSigmoidFocalLossGradKernel, - ops::GPUSigmoidFocalLossGradKernel); + ops::GPUSigmoidFocalLossGradKernel, + ops::GPUSigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu index 1ab698998c7..337f55a3ca8 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cu +++ b/paddle/fluid/operators/detection/target_assign_op.cu @@ -41,8 +41,8 @@ __global__ void NegTargetAssignKernel(const int* neg_indices, } template -struct NegTargetAssignFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct NegTargetAssignFunctor { + void operator()(const phi::GPUContext& ctx, const int* neg_indices, const size_t* lod, const int N, @@ -58,16 +58,13 @@ struct NegTargetAssignFunctor { } }; -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - target_assign, - ops::TargetAssignKernel, - ops::TargetAssignKernel); +REGISTER_OP_CUDA_KERNEL(target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu index e7f564b7ab4..9926d0e5436 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cu +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu @@ -15,6 +15,5 @@ limitations under the License. */ #include "paddle/fluid/operators/dgc_clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_clip_by_norm, - ops::DGCClipByNormKernel); +REGISTER_OP_CUDA_KERNEL(dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu index 0f0bf441a70..e8aa9b5245d 100644 --- a/paddle/fluid/operators/dgc_op.cu +++ b/paddle/fluid/operators/dgc_op.cu @@ -16,5 +16,4 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc, ops::DGCOpKernel); +REGISTER_OP_CUDA_KERNEL(dgc, ops::DGCOpKernel); diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu index c40206b0032..c9afc983b03 100644 --- a/paddle/fluid/operators/diag_op.cu +++ b/paddle/fluid/operators/diag_op.cu @@ -16,9 +16,8 @@ limitations under the License. 
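Most registration hunks in this stretch (sigmoid_focal_loss, target_assign, dgc_clip_by_norm, dgc, diag) shrink to a flatter REGISTER_OP_CUDA_KERNEL call because only one GPU device-context type remains, but the concrete kernel template arguments are missing from the extracted text. Below is a hedged sketch of the post-change registration shape; the op name, kernel class, and the float/double pair are placeholders standing in for whatever each real hunk registers.

// Sketch: general shape of a migrated REGISTER_OP_CUDA_KERNEL call.
// example_op and ExampleKernel are placeholders, not part of this patch.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ExampleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {}
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(example_op,
                        ops::ExampleKernel<phi::GPUContext, float>,
                        ops::ExampleKernel<phi::GPUContext, double>);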
*/ #include "paddle/fluid/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +REGISTER_OP_CUDA_KERNEL(diag, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc index 3fa1b6ef08c..92e5d66776d 100644 --- a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); inference::tensorrt::RandomizeTensor(tensor, place, ctx); } @@ -127,7 +127,7 @@ TEST(DlnneEngineOp, manual) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); // Prepare variables. CreateCUDATensor(&scope, "x", std::vector({2, 4})); CreateCUDATensor(&scope, "y", std::vector({4, 6})); @@ -145,7 +145,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 7733d202e57..d51c57d6eab 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -98,7 +98,7 @@ TEST(Dropout, CPUDense) { TEST(Dropout, GPUDense) { f::Scope scope; p::CUDAPlace place; - p::CUDADeviceContext ctx(place); + p::phi::GPUContext ctx(place); Compare(scope, ctx); } */ diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 369fea2b0b1..681f91ffa68 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -85,9 +85,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { auto batch_size = x1_t->dims()[0]; auto normalized = ctx.Attr("normalized"); - auto stream = reinterpret_cast( - ctx.device_context()) - .stream(); + auto stream = + reinterpret_cast(ctx.device_context()).stream(); framework::Vector hyp_lod(batch_size + 1); framework::Vector ref_lod(batch_size + 1); @@ -124,8 +123,8 @@ class EditDistanceGPUKernel : public framework::OpKernel { } const size_t num_strs = hyp_lod.size() - 1; - phi::funcs::SetConstant set_constant; - set_constant(ctx.template device_context(), + phi::funcs::SetConstant set_constant; + set_constant(ctx.template device_context(), sequence_num, static_cast(num_strs)); diff --git a/paddle/fluid/operators/eigvalsh_op.cu b/paddle/fluid/operators/eigvalsh_op.cu index 3ed431f8002..880570d1be0 100644 --- a/paddle/fluid/operators/eigvalsh_op.cu +++ b/paddle/fluid/operators/eigvalsh_op.cu @@ -16,25 +16,23 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigvalsh, - ops::EigvalshKernel, - ops::EigvalshKernel, - ops::EigvalshKernel>, - ops::EigvalshKernel>); +REGISTER_OP_CUDA_KERNEL(eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); REGISTER_OP_CUDA_KERNEL( eigvalsh_grad, - ops::EigvalshGradKernel, - ops:: - EigvalshGradKernel, - ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, - ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index b0b0db5cde4..f81b76aa487 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1596,7 +1596,7 @@ static inline std::vector GetReduceDim(const framework::DDim &in, #if defined(__NVCC__) || defined(__HIPCC__) template -void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXAndYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, @@ -1609,7 +1609,7 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, } template -void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, +void GetGradXOrYOut(const phi::GPUContext &dev_ctx, const platform::Place &place, int axis, std::vector ins, diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 6f1e04ebfa6..3d32c9b8a14 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -158,17 +158,15 @@ REGISTER_OP_CPU_KERNEL(expand_as_grad, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); +REGISTER_OP_CUDA_KERNEL(expand_as, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); #endif diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index d8c66f95a13..1261b777701 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -294,19 +294,17 @@ REGISTER_OP_CPU_KERNEL(expand_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); REGISTER_OP_CUDA_KERNEL( expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); #endif diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu index 5562baca97f..34855fbc96e 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu +++ 
b/paddle/fluid/operators/fake_dequantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsKernel, diff --git a/paddle/fluid/operators/fake_dequantize_op.cu.h b/paddle/fluid/operators/fake_dequantize_op.cu.h index 65dfad185c1..161b87ea392 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cu.h +++ b/paddle/fluid/operators/fake_dequantize_op.cu.h @@ -31,8 +31,8 @@ __global__ void KeDequantize( } template -struct DequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -102,8 +102,8 @@ __global__ void DequantizeTwoScale(const T* in, } template -struct ChannelDequantizeFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctor { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, @@ -163,10 +163,10 @@ struct ChannelDequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; -template struct ChannelDequantizeFunctor; -template struct ChannelDequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index c7ad664b7da..a19369fc6f2 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
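A handful of files, such as cross_entropy_op.cu and deformable_psroi_pooling_op.cu earlier, fake_dequantize_op.cu above, and the fake_quantize_op.cu hunk that continues below, keep their registration lists untouched by swapping only a file-local alias. Below is a sketch of that alias approach; the registration shown in the comment is an assumption about the surrounding file, since its template arguments were stripped in extraction.

// Sketch: only the alias target changes, so existing registrations that are
// written in terms of CUDA keep compiling unchanged.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_context.h"

namespace ops = paddle::operators;
// Before: using CUDA = paddle::platform::CUDADeviceContext;
using CUDA = phi::GPUContext;
using float16 = paddle::platform::float16;

// Existing lines of the form (template arguments assumed, not quoted)
//   REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
//                           ops::FakeQuantizeAbsMaxKernel<CUDA, float>,
//                           ops::FakeQuantizeAbsMaxKernel<CUDA, float16>);
// are untouched because they only reference the alias.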
*/ #include "paddle/fluid/operators/fake_quantize_op.cu.h" namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; using float16 = paddle::platform::float16; REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 3b1877f2bc8..22ba8254cdc 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -72,8 +72,8 @@ __global__ void FindAbsMaxKernel(const T *in, const int n, T *out) { } template -struct FindAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const T *in, const int num, T *out) { @@ -90,9 +90,8 @@ struct FindAbsMaxFunctor { } }; -template struct FindAbsMaxFunctor; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template __global__ void FindChannelAbsMaxKernelQuantAxis0(const T *in, @@ -164,8 +163,8 @@ __global__ void FindChannelAbsMaxKernelQuantAxis1( } template -struct FindChannelAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindChannelAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_tensor, const int quant_axis, T *out_abs_max) { @@ -215,7 +214,7 @@ struct FindChannelAbsMaxFunctor { } }; -template struct FindChannelAbsMaxFunctor; +template struct FindChannelAbsMaxFunctor; template __global__ void ClipAndQuantKernel(const T *in, @@ -289,8 +288,8 @@ __global__ void ClipAndQuantDequantKernel(const T *in, } template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -309,11 +308,11 @@ struct ClipAndFakeQuantFunctor { } }; -template struct ClipAndFakeQuantFunctor; +template struct ClipAndFakeQuantFunctor; template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ClipAndFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -408,8 +407,8 @@ __global__ void ChannelClipAndQuantKernelQuantAxisN(const T *in, } template -struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipAndFakeQuantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -462,8 +461,7 @@ struct ChannelClipAndFakeQuantFunctor { } }; -template struct ChannelClipAndFakeQuantFunctor; +template struct ChannelClipAndFakeQuantFunctor; template __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, @@ -491,8 +489,8 @@ __global__ void FindRangeAbsMaxAndFillArray(const T *cur_scale, } template -struct FindRangeAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindRangeAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &cur_scale, const framework::Tensor &last_scale, const framework::Tensor &iter, @@ -535,7 +533,7 @@ struct FindRangeAbsMaxFunctor { sizeof(int), ctx.stream()); ctx.Wait(); - FindAbsMaxFunctor()( + FindAbsMaxFunctor()( ctx, scale_arr, len, out_scale_data); } } @@ -556,11 
+554,11 @@ __global__ void FindMovingAverageAbsMaxKernel(const T *in_state, *out_scale = accum / state; } -template struct FindRangeAbsMaxFunctor; +template struct FindRangeAbsMaxFunctor; template -struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct FindMovingAverageAbsMaxFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in_accum, const framework::Tensor &in_state, const T *cur_scale, @@ -660,8 +658,8 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, } template -struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext &ctx, +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const phi::GPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -712,8 +710,7 @@ struct ChannelClipFakeQuantDequantFunctor { } }; -template struct ChannelClipFakeQuantDequantFunctor; +template struct ChannelClipFakeQuantDequantFunctor; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fc_op.cu.cc b/paddle/fluid/operators/fc_op.cu.cc index 4147903551d..35c55135f21 100644 --- a/paddle/fluid/operators/fc_op.cu.cc +++ b/paddle/fluid/operators/fc_op.cu.cc @@ -15,8 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fc, - ops::FCOpKernel, - ops::FCOpKernel, - ops::FCOpKernel); +REGISTER_OP_CUDA_KERNEL(fc, + ops::FCOpKernel, + ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 93fb678e211..43776e98a02 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -42,7 +42,7 @@ void GetLinearOp(const std::vector &x, const std::vector &y, const framework::DDim &x_dim, const framework::DDim &y_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -87,7 +87,7 @@ void GetElementwiseAddOp(const std::vector &x, const std::vector &y, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *out) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -128,7 +128,7 @@ void GetLinearOpGrad(const std::vector &x_vec, const framework::DDim &x_dim, const framework::DDim &y_dim, const framework::DDim &out_dim, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, bool transpose_a, bool transpose_b, float alpha, @@ -218,7 +218,7 @@ template void GetElementwiseAddOpGrad(const std::vector &dout_vec, const int bsz_seq, const int output_size, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, std::vector *dy_vec) { framework::Scope scope; auto var_x = scope.Var("X"); @@ -308,7 +308,7 @@ class TestFeedForward { bsz_seq_ = batch_size_ * seq_len_; output_size_ = 3 * num_head_ * dim_head_; input_size_ = dim_embed_; - ctx_ = new platform::CUDADeviceContext(place_); + ctx_ = new phi::GPUContext(place_); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place_, ctx_->stream()) .get()); @@ -559,7 +559,7 @@ class TestFeedForward { std::vector base_dinput_vec_, base_dweight_vec_, base_dbias_vec_; platform::CUDAPlace place_; - platform::CUDADeviceContext *ctx_; + phi::GPUContext *ctx_; }; // test for fp32, fp16, fp32+bias and fp16+bias diff --git 
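The feed_forward_test.cu hunk shows the other recurring change in tests: instead of constructing a platform::CUDADeviceContext, the fixture now constructs a phi::GPUContext and attaches an allocator from the global facade. A minimal sketch of that setup, with the rest of the fixture omitted:

platform::CUDAPlace place(0);
auto* ctx = new phi::GPUContext(place);
// Bind the allocator for this place and stream explicitly, as the test does,
// since a freshly constructed context does not pick one up on its own.
ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                      .GetAllocator(place, ctx->stream())
                      .get());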
a/paddle/fluid/operators/fill_any_op.cu.cc b/paddle/fluid/operators/fill_any_op.cu.cc index ca1726508c4..2a561e6d350 100644 --- a/paddle/fluid/operators/fill_any_op.cu.cc +++ b/paddle/fluid/operators/fill_any_op.cu.cc @@ -17,20 +17,18 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel); REGISTER_OP_CUDA_KERNEL( fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 8e51c203d41..bd8303fe402 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -133,9 +133,9 @@ class FillConstantKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); #else diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 8e30e0833d6..105b207636c 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, diff --git a/paddle/fluid/operators/fill_diagonal_tensor_op.cu b/paddle/fluid/operators/fill_diagonal_tensor_op.cu index a7c26caa8fb..1b6ab71386b 100644 --- a/paddle/fluid/operators/fill_diagonal_tensor_op.cu +++ b/paddle/fluid/operators/fill_diagonal_tensor_op.cu @@ -18,7 +18,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_diagonal_tensor_kernel(int64_t size, @@ -109,7 +108,7 @@ class FillDiagonalTensorCUDAKernel : public framework::OpKernel { auto size = out->numel(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = @@ -175,8 +174,7 @@ class FillDiagonalTensorGradCUDAKernel : public framework::OpKernel { auto size = dx->numel(); - auto &dev_ctx = - ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); Tensor tensor_tmp; int64_t *memory_block_cu = diff --git a/paddle/fluid/operators/fill_zeros_like_op.cu.cc b/paddle/fluid/operators/fill_zeros_like_op.cu.cc index 91809b8cd11..fad1bba49f3 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cu.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cu.cc @@ -21,28 +21,24 @@ limitations under the License. 
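The fill_constant_op.h hunk keeps using the generic SetConstant helper; after the unification its device-context parameter is phi::GPUContext and the pooled DeviceContext reference is cast to that type. The collapsed template and cast arguments are reconstructed below as an assumption:

phi::funcs::SetConstant<phi::GPUContext, T> functor;
auto& dev_ctx = *pool.Get(ctx.GetPlace());
// The pool returns a generic DeviceContext&; on a CUDA place it is backed by
// a phi::GPUContext, so the cast narrows it to the type SetConstant expects.
functor(reinterpret_cast<const phi::GPUContext&>(dev_ctx),
        tensor,
        static_cast<T>(value));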
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_zeros_like, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CUDA_KERNEL( fill_zeros_like2, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc index e287ce1515a..0a055c688ee 100644 --- a/paddle/fluid/operators/flatten_op.cu.cc +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -17,35 +17,31 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); -REGISTER_OP_CUDA_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); -REGISTER_OP_CUDA_KERNEL( - flatten2, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel); -REGISTER_OP_CUDA_KERNEL( - flatten2_grad, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel); +REGISTER_OP_CUDA_KERNEL(flatten, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CUDA_KERNEL(flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CUDA_KERNEL(flatten2, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CUDA_KERNEL(flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/fold_op.cu b/paddle/fluid/operators/fold_op.cu index 2d2b334b021..7728d57a276 100644 --- a/paddle/fluid/operators/fold_op.cu +++ b/paddle/fluid/operators/fold_op.cu @@ -16,12 +16,10 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fold, - ops::FoldOpKernel, - ops::FoldOpKernel); +REGISTER_OP_CUDA_KERNEL(fold, + ops::FoldOpKernel, + ops::FoldOpKernel); -REGISTER_OP_CUDA_KERNEL( - fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); +REGISTER_OP_CUDA_KERNEL(fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu index a762054a1ea..d1931367307 100644 --- a/paddle/fluid/operators/fsp_op.cu +++ b/paddle/fluid/operators/fsp_op.cu @@ -18,8 +18,8 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(fsp, - ops::FSPOpKernel, - ops::FSPOpKernel); + ops::FSPOpKernel, + ops::FSPOpKernel); REGISTER_OP_CUDA_KERNEL(fsp_grad, - ops::FSPGradOpKernel, - ops::FSPGradOpKernel); + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index 3db4992bd29..baed3ca7a1a 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -22,7 +22,7 @@ namespace operators { template class AttnLayerNorm { public: - AttnLayerNorm(const platform::CUDADeviceContext& dev_ctx, + AttnLayerNorm(const phi::GPUContext& dev_ctx, float epsilon, int64_t batch_size, int64_t feature_size) @@ -82,7 +82,7 @@ class AttnLayerNorm { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t feature_size_; diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 6dd6cc28139..fa50d5b23bf 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -95,7 +95,7 @@ __global__ void BroadcastKernelBinary( // bias add forward impl for "[m, n] + [n] = [m, n]" template -void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, +void LaunchBiasAddFwKernel(const phi::GPUContext& ctx, int m, int n, const T* in0, @@ -302,7 +302,7 @@ __global__ void BiasAddBw1DReduceKernel(const ReduceParamType* temp_sum, } template -void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, +void Launch2DColumnReduce(const phi::GPUContext& dev_ctx, const int max_threads, const int reduce_num, const int left_num, @@ -345,11 +345,8 @@ void Launch2DColumnReduce(const platform::CUDADeviceContext& dev_ctx, // input // and d_bias[n] as output. 
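attention_layer_norm.h is representative of the fused helper classes (FeedForward, AttnMatMul, and FMHARef follow the same shape): the constructor parameter and the stored reference member change type, nothing else. A trimmed sketch of the pattern; the exact member list is an assumption:

template <typename T>
class AttnLayerNorm {
 public:
  AttnLayerNorm(const phi::GPUContext& dev_ctx,
                float epsilon,
                int64_t batch_size,
                int64_t feature_size)
      : dev_ctx_(dev_ctx),
        epsilon_(epsilon),
        batch_size_(batch_size),
        feature_size_(feature_size) {}

 private:
  const phi::GPUContext& dev_ctx_;  // previously platform::CUDADeviceContext&
  float epsilon_;
  int64_t batch_size_;
  int64_t feature_size_;
};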
template -void LaunchBiasAddBwKernel(const platform::CUDADeviceContext& dev_ctx, - int m, - int n, - const T* d_out, - T* d_bias) { +void LaunchBiasAddBwKernel( + const phi::GPUContext& dev_ctx, int m, int n, const T* d_out, T* d_bias) { int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); int reduce_num = m; int left_num = n; diff --git a/paddle/fluid/operators/fused/attn_feed_forward.h b/paddle/fluid/operators/fused/attn_feed_forward.h index 568c283d3e4..753eb447108 100644 --- a/paddle/fluid/operators/fused/attn_feed_forward.h +++ b/paddle/fluid/operators/fused/attn_feed_forward.h @@ -24,7 +24,7 @@ namespace operators { template class FeedForward { public: - FeedForward(const platform::CUDADeviceContext& dev_ctx, + FeedForward(const phi::GPUContext& dev_ctx, int bsz_seq, int output_size, int input_size, @@ -53,7 +53,7 @@ class FeedForward { // column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out) // here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -78,7 +78,7 @@ class FeedForward { T* input, T* weight, T* d_output, T* d_input, T* d_weight, T* d_bias) { T alpha = static_cast(1.0); T beta = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); // column-major: gemm-nt, get d_weight. CBLAS_TRANSPOSE transA = CblasTrans; @@ -116,7 +116,7 @@ class FeedForward { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int bsz_seq_, output_size_, input_size_; bool compute_bias_; }; diff --git a/paddle/fluid/operators/fused/attn_gemm.h b/paddle/fluid/operators/fused/attn_gemm.h index 9adfe8e088d..07947f522cd 100644 --- a/paddle/fluid/operators/fused/attn_gemm.h +++ b/paddle/fluid/operators/fused/attn_gemm.h @@ -30,7 +30,7 @@ template class AttnMatMul { public: // (m, n, k) = bsz_seq, output_size, input_size - AttnMatMul(const platform::CUDADeviceContext& dev_ctx, + AttnMatMul(const phi::GPUContext& dev_ctx, bool transA, bool transB, int bsz_seq, @@ -60,7 +60,7 @@ class AttnMatMul { T beta = static_cast(0.0); // (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.GEMM(transA, transB, bsz_seq_, @@ -91,7 +91,7 @@ class AttnMatMul { T beta_dA = use_addto ? 
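attn_feed_forward.h and attn_gemm.h repeat the same one-line substitution at every BLAS call site: the context type handed to phi::funcs::GetBlas becomes phi::GPUContext. A sketch of the reconstructed call; the operand names are illustrative, not the file's:

auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx_);
// (m, n, k) = bsz_seq, output_size, input_size, as in the surrounding code.
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_,
          alpha, input_data, weight_data, beta, output_data);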
static_cast(1.0) : static_cast(0.0); T beta_dB = static_cast(0.0); - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); if (!transA_) { // forward: gemm-nt if (transB_) { @@ -223,7 +223,7 @@ class AttnMatMul { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool transA_; bool transB_; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 121cbc909b8..81e8c573266 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -43,7 +43,7 @@ template class CUDNNConvFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* bias = ctx.Input("Bias"); @@ -109,17 +109,15 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); + ctx.AllocateTmpTensor(new_input_shape, dev_ctx); const int rank = transformed_input_channel.dims().size(); T pad_value(0.0); switch (rank) { case 4: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, @@ -127,7 +125,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { &transformed_input); } break; case 5: { - phi::funcs::PadFunction( + phi::funcs::PadFunction( dev_ctx, input_pad, transformed_input_channel, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 0bda60f6b8b..e11792a5dfb 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -149,7 +149,7 @@ void ComputeInplaceRelu(framework::Tensor *cpu_x) { } } -void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, +void ComputeBatchNormForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, @@ -216,7 +216,7 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluForward(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_z, const Tensor &cpu_scale, @@ -280,7 +280,7 @@ void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, saved_reserve_space->ShareDataWith(*reserve_space); } -void ComputeFusedBNAddReluBackward(const platform::CUDADeviceContext &ctx, +void ComputeFusedBNAddReluBackward(const phi::GPUContext &ctx, const Tensor &cpu_dy, const Tensor &cpu_x, const Tensor &cpu_scale, @@ -384,10 +384,8 @@ class CudnnBNAddReluTester { << ", is_relative_atol=" << is_relative_atol << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ << ", has_shortcut=" << has_shortcut_; - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); auto select = [&](Tensor 
*in) { return has_shortcut_ ? in : nullptr; }; @@ -469,10 +467,8 @@ class CudnnBNAddReluTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_dx_base; framework::Tensor cpu_dz_base; @@ -526,7 +522,7 @@ class CudnnBNAddReluTester { {channels_}, static_cast(0.0f), cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -573,7 +569,7 @@ class CudnnBNAddReluTester { } } - void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineForwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -594,7 +590,7 @@ class CudnnBNAddReluTester { saved_reserve_space); } - void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + void BaselineBackwardFusedBNAddRelu(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, @@ -614,7 +610,7 @@ class CudnnBNAddReluTester { cpu_dbias); } - void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx, + void ComputeFusedBNStatsFinalize(const phi::GPUContext &ctx, const Tensor &cpu_x, const Tensor &cpu_bn_scale, const Tensor &cpu_bn_bias, @@ -671,7 +667,7 @@ class CudnnBNAddReluTester { } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, Tensor *cpu_mean_x, Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, @@ -809,7 +805,7 @@ class CudnnBNAddReluTester { } // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, Tensor *cpu_dx, Tensor *cpu_dz, Tensor *cpu_dscale, diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 719c2fe64e5..628642b9563 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -61,7 +61,7 @@ struct BNStatsFinalizeArgs { template class CudnnBNStatsFinalize { public: - CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + CudnnBNStatsFinalize(const phi::GPUContext &ctx, const std::vector ¶m_shape) : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { @@ -69,7 +69,7 @@ class CudnnBNStatsFinalize { } ~CudnnBNStatsFinalize() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &sum, const Tensor &sum_of_squares, const Tensor &scale, @@ -130,7 +130,7 @@ class CudnnBNStatsFinalize { } private: - void TrainInit(const platform::CUDADeviceContext &ctx) { + void TrainInit(const phi::GPUContext &ctx) { // Set constant_param for train op train_op_.SetOpConstParamAttr({CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, @@ -167,7 +167,7 @@ class CudnnBNStatsFinalize { &workspace_size_bytes); } - void InferenceInit(const platform::CUDADeviceContext &ctx) { + void InferenceInit(const phi::GPUContext &ctx) { // Set constant_param for inference op 
inference_op_.SetOpConstParamAttr({CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 3c924ddd9d9..34cf677223c 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -37,7 +37,7 @@ struct NormConvolutionArgs { compute_type = platform::CudnnDataType::type; } - void Set(const platform::CUDADeviceContext &ctx, + void Set(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -124,7 +124,7 @@ struct NormConvolutionArgs { conv_desc.set(dtype, paddings, strides, dilations, false, group); } - bool IsSupport(const platform::CUDADeviceContext &ctx, + bool IsSupport(const phi::GPUContext &ctx, const std::vector &filter_shape, int stride, int dilation, @@ -167,7 +167,7 @@ struct NormConvolutionArgs { template class CudnnNormConvolution { public: - CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + CudnnNormConvolution(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -186,7 +186,7 @@ class CudnnNormConvolution { } ~CudnnNormConvolution() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, Tensor *output, @@ -228,7 +228,7 @@ class CudnnNormConvolution { } private: - CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetForwardOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetForward()); @@ -284,7 +284,7 @@ class CudnnNormConvolution { template class CudnnNormConvolutionGrad { public: - CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + CudnnNormConvolutionGrad(const phi::GPUContext &ctx, const std::vector &input_shape, const std::vector &filter_shape, const std::vector &output_shape, @@ -304,7 +304,7 @@ class CudnnNormConvolutionGrad { } ~CudnnNormConvolutionGrad() {} - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &input, const Tensor &filter, const Tensor &output_grad, @@ -327,7 +327,7 @@ class CudnnNormConvolutionGrad { } private: - void BackwardFilter(const platform::CUDADeviceContext &ctx, + void BackwardFilter(const phi::GPUContext &ctx, T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { @@ -355,7 +355,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - void BackwardData(const platform::CUDADeviceContext &ctx, + void BackwardData(const phi::GPUContext &ctx, T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, @@ -387,7 +387,7 @@ class CudnnNormConvolutionGrad { workspace_size); } - CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + CudnnFusionOp *GetBackwardFilterOp(const phi::GPUContext &ctx) { framework::AlgorithmsCache &cache = *(CudnnFusionOpCache::Instance().GetBackward()); @@ -430,7 +430,7 @@ class CudnnNormConvolutionGrad { return wgrad_op; } - size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t GetWorkspaceSizeBwdData(const phi::GPUContext &ctx) { size_t workspace_size = 0U; auto handle = ctx.cudnn_handle(); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 7d404e6b3ed..ef93612ffce 100644 --- 
a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -94,7 +94,7 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DForward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, Tensor *cpu_output, @@ -130,7 +130,7 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, +void ComputeConv2DBackward(const phi::GPUContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -242,10 +242,8 @@ class CudnnNormConvolutionTester { ~CudnnNormConvolutionTester() {} void CheckForward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; @@ -266,10 +264,8 @@ class CudnnNormConvolutionTester { } void CheckBackward(float diff, bool is_relative_atol = false) { - platform::CUDADeviceContext *ctx = - static_cast( - platform::DeviceContextPool::Instance().Get( - platform::CUDAPlace(0))); + phi::GPUContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); framework::Tensor cpu_input_grad_base; framework::Tensor cpu_filter_nchw_grad_base; @@ -304,7 +300,7 @@ class CudnnNormConvolutionTester { &cpu_output_grad_); } - void BaselineForward(const platform::CUDADeviceContext &ctx, + void BaselineForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { @@ -314,7 +310,7 @@ class CudnnNormConvolutionTester { *cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } - void BaselineBackward(const platform::CUDADeviceContext &ctx, + void BaselineBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { ComputeConv2DBackward(ctx, @@ -329,7 +325,7 @@ class CudnnNormConvolutionTester { } // get forward results of cudnn_norm_conv - void FusedForward(const platform::CUDADeviceContext &ctx, + void FusedForward(const phi::GPUContext &ctx, framework::Tensor *cpu_output, framework::Tensor *cpu_sum, framework::Tensor *cpu_sum_of_square) { @@ -367,7 +363,7 @@ class CudnnNormConvolutionTester { sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - void FusedBackward(const platform::CUDADeviceContext &ctx, + void FusedBackward(const phi::GPUContext &ctx, framework::Tensor *cpu_input_grad, framework::Tensor *cpu_filter_grad) { framework::Tensor input; @@ -443,7 +439,7 @@ TEST(CudnnNormConvFp16, K1S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -473,7 +469,7 @@ TEST(CudnnNormConvFp16, K3S1) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -503,7 +499,7 @@ 
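The cuDNN test drivers (cudnn_bn_add_relu_test.cc, cudnn_norm_conv_test.cc) pull the default device context out of the pool and now cast it straight to phi::GPUContext; the cast target is the only change. Reconstructed pattern, with the collapsed cast argument filled in as an assumption:

phi::GPUContext* ctx = static_cast<phi::GPUContext*>(
    platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
if (ctx->GetComputeCapability() < 70) {
  // These fused-op tests require at least SM 7.0, as in the original checks.
  return;
}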
TEST(CudnnNormConvFp16, K1S1O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() < 70) { @@ -533,7 +529,7 @@ TEST(CudnnNormConvFp16, K1S2O4) { output_channels, kernel_size, stride); - platform::CUDADeviceContext *ctx = static_cast( + phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); if (ctx->GetComputeCapability() <= 70) { diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 61e513e911a..b25605c6ca0 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -100,7 +100,7 @@ struct ScaleBiasAddReluArgs { template class CudnnScaleBiasAddRelu { public: - CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + CudnnScaleBiasAddRelu(const phi::GPUContext &ctx, const std::string &act_type, bool fuse_add, bool has_shortcut, @@ -116,7 +116,7 @@ class CudnnScaleBiasAddRelu { ~CudnnScaleBiasAddRelu() {} - void Forward(const platform::CUDADeviceContext &ctx, + void Forward(const phi::GPUContext &ctx, const Tensor &x, const Tensor &x_scale, const Tensor &x_bias, @@ -171,7 +171,7 @@ class CudnnScaleBiasAddRelu { fwd_workspace_byte_); } - void Backward(const platform::CUDADeviceContext &ctx, + void Backward(const phi::GPUContext &ctx, const Tensor &dy, const Tensor &x, const Tensor &scale, @@ -237,7 +237,7 @@ class CudnnScaleBiasAddRelu { } private: - void ForwardInit(const platform::CUDADeviceContext &ctx) { + void ForwardInit(const phi::GPUContext &ctx) { // Set constant_param fwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, @@ -285,7 +285,7 @@ class CudnnScaleBiasAddRelu { CUDNN_BATCHNORM_SPATIAL_PERSISTENT); } - void BackwardInit(const platform::CUDADeviceContext &ctx) { + void BackwardInit(const phi::GPUContext &ctx) { // Set constant_param bwd_op_.SetOpConstParamAttr({CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index ef1befbb320..7de59dd9ee2 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -67,7 +67,7 @@ class AttnDropoutParam { template class FMHARef { public: - FMHARef(const platform::CUDADeviceContext& dev_ctx, + FMHARef(const phi::GPUContext& dev_ctx, int64_t batch_size, int64_t seq_len, int64_t num_head, @@ -146,7 +146,7 @@ class FMHARef { // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transB = CblasTrans; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; int gemm_n = out_seq_len; @@ -274,7 +274,7 @@ class FMHARef { Tensor* transpose_2_out_grad_tensor, Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) { - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int k_size = q_size; int softmax_axis = -1; @@ -479,7 +479,7 @@ class FMHARef { } private: - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; int64_t batch_size_; int64_t seq_len_; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu 
b/paddle/fluid/operators/fused/fused_attention_op.cu index 2c3fd75d8e0..ed904df93df 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -43,7 +43,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index c94aae0dd49..53984707d50 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -37,7 +37,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormActKernel +class FusedBatchNormActKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -88,7 +88,7 @@ class FusedBatchNormActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, // skip the batch norm calculation, let y = act(x). @@ -217,7 +217,7 @@ class FusedBatchNormActKernel }; template -class FusedBatchNormActGradKernel +class FusedBatchNormActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -268,7 +268,7 @@ class FusedBatchNormActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { if (act_type == "relu") { auto x_v = framework::EigenVector::Flatten(*x); @@ -281,9 +281,7 @@ class FusedBatchNormActGradKernel PADDLE_THROW( platform::errors::Unimplemented("Unsupported activation type")); } - phi::funcs::SetConstant> - functor; + phi::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); return; @@ -402,12 +400,12 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel, - ops::FusedBatchNormActKernel); + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel, + ops::FusedBatchNormActKernel); REGISTER_OP_CUDA_KERNEL( fused_batch_norm_act_grad, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel, - ops::FusedBatchNormActGradKernel); + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel, + ops::FusedBatchNormActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index e703ce810cd..23dbbe2ad08 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -36,7 +36,7 @@ template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -class FusedBatchNormAddActKernel +class FusedBatchNormAddActKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext &ctx) const override { @@ -81,7 +81,7 @@ class FusedBatchNormAddActKernel const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); @@ -194,7 +194,7 @@ class FusedBatchNormAddActKernel }; template -class FusedBatchNormAddActGradKernel +class FusedBatchNormAddActGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -243,7 +243,7 @@ class FusedBatchNormAddActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; @@ -353,9 +353,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation, - ops::FusedBatchNormAddActKernel); + ops::FusedBatchNormAddActKernel); REGISTER_OP_CUDA_KERNEL( fused_bn_add_activation_grad, - ops::FusedBatchNormAddActGradKernel); + ops::FusedBatchNormAddActGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 5016cb65fb7..732da5fa52a 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -125,7 +125,7 @@ void LaunchDropoutActBias(Functor act_functor, const T *bias, T *dst, MaskType *mask_data, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { SetZero(ctx, dst, rows * cols); @@ -277,7 +277,7 @@ void LaunchDropoutActBiasGrad(Functor act_functor, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0); auto factor = dropout_prob == static_cast(1.0f) ? 
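Inside the fused batch-norm kernels only the execution-context accessor changes; the cuDNN handle and CUDA stream are then taken from the same object as before. A sketch with the collapsed template argument reconstructed as an assumption and the kernel body elided:

void Compute(const framework::ExecutionContext& ctx) const override {
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto handle = dev_ctx.cudnn_handle();  // cuDNN handle, same accessor as before
  auto stream = dev_ctx.stream();        // CUDA stream, same accessor as before
  // ... remainder of the kernel body is unchanged by this patch ...
}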
zero diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 18f51b5d02b..06810c18cc0 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -57,7 +57,7 @@ struct TestFusedDropoutActBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedDropoutActBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedDropoutActBias(int rows_, @@ -87,7 +87,7 @@ struct TestFusedDropoutActBias { has_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedDropoutActBias() {} diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index faac7691ae2..0f37d242ebc 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -40,7 +40,7 @@ namespace operators { * 2D grids: gridDim.y = rows */ inline platform::GpuLaunchConfig Get1DBlocksAnd2DGrids( - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, const uint32_t rows, const uint32_t cols, const int vec_size) { @@ -101,9 +101,7 @@ __forceinline__ __device__ void RandVec<8>(curandStatePhilox4_32_10_t *state, } template -inline void SetZero(const platform::CUDADeviceContext &ctx, - T *ptr, - const size_t size) { +inline void SetZero(const phi::GPUContext &ctx, T *ptr, const size_t size) { PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream())); } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index aa4c6622f70..208b2a58bca 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -100,8 +100,7 @@ struct DropoutParam { seed_val = context.Attr(pre_fix + "seed"); } - int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, - const int offset) { + int UpdateSeedAndIncrement(const phi::GPUContext& ctx, const int offset) { uint64_t tmp_increment; GetSeedDataAndIncrement( ctx, tensor_seed, fix_seed, seed_val, offset, &seed, &tmp_increment); @@ -113,7 +112,7 @@ struct DropoutParam { template class FusedDropoutHelper { private: - int GetIncrement(const platform::CUDADeviceContext& ctx) { + int GetIncrement(const phi::GPUContext& ctx) { const int VecSize = MAX_CACHE_BYTES / sizeof(T); const int real_vec_size = cols_ % VecSize == 0 ? 
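fused_dropout_common.h is typical of the free-function helpers: the context parameter type changes and the body keeps calling ctx.stream(). The collapsed template header is reconstructed below as an assumption:

template <typename T>
inline void SetZero(const phi::GPUContext& ctx, T* ptr, const size_t size) {
  // Asynchronously zero the buffer on the context's CUDA stream.
  PADDLE_ENFORCE_GPU_SUCCESS(
      cudaMemsetAsync(ptr, 0, size * sizeof(T), ctx.stream()));
}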
VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, @@ -130,7 +129,7 @@ class FusedDropoutHelper { public: FusedDropoutHelper() {} - FusedDropoutHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param) { @@ -140,7 +139,7 @@ class FusedDropoutHelper { } // out = residual + dropout( src + bias ) - void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -162,7 +161,7 @@ class FusedDropoutHelper { ctx); } - void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void ResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const MaskType* mask, T* d_src, @@ -189,7 +188,7 @@ class FusedDropoutHelper { } // out = dropout(activation(src + bias)) - void DropoutActBias(const platform::CUDADeviceContext& ctx, + void DropoutActBias(const phi::GPUContext& ctx, const T* src, const T* bias, const std::string& act_method, @@ -234,7 +233,7 @@ class FusedDropoutHelper { } } - void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, + void DropoutActBiasGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const T* bias, @@ -297,7 +296,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { epsilon_ = epsilon; } - FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + FusedDropoutLayerNormHelper(const phi::GPUContext& ctx, const int rows, const int cols, const DropoutParam& dropout_param, @@ -308,7 +307,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } // call layer_norm - void LayerNorm(const platform::CUDADeviceContext& ctx, + void LayerNorm(const phi::GPUContext& ctx, const T* src, const LayerNormParamType* gamma, const LayerNormParamType* beta, @@ -324,7 +323,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } } - void LayerNormGrad(const platform::CUDADeviceContext& ctx, + void LayerNormGrad(const phi::GPUContext& ctx, const T* dout, const T* src, const LayerNormParamType* gamma, @@ -350,7 +349,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { // out = layernorm(residual + dropout(src + bias)) template , bool is_same_type = false> - void LayernormResidualDropoutBias(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBias(const phi::GPUContext& ctx, const T* src, const T* residual, const T* bias, @@ -392,7 +391,7 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { } template , bool is_same_type = false> - void LayernormResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + void LayernormResidualDropoutBiasGrad(const phi::GPUContext& ctx, const T* d_out, const T* layernorm_src, const MaskType* mask, diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index e3ab187f0d7..8fac3165f1c 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -49,7 +49,7 @@ void Dropout(const std::vector &x, const framework::DDim &x_dim, std::vector *out, std::vector *mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, uint64_t seed, float dropout_prob, bool is_upscale_in_train, @@ -97,7 +97,7 @@ void DropoutGrad(std::vector *dx, const framework::DDim &x_dim, const std::vector &dout, const std::vector &mask, - const platform::CUDADeviceContext &ctx, + const phi::GPUContext &ctx, float 
dropout_prob, bool is_upscale_in_train) { framework::Scope scope; @@ -148,7 +148,7 @@ void LayerNorm(const std::vector> &scale, const float epsilon, const int rows, const int cols, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { framework::Scope scope; auto place = ctx.GetPlace(); paddle::optional scale_opt; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu index 7b44aa82e4a..80b10021c09 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cu @@ -17,36 +17,28 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CUDA_KERNEL( fused_elemwise_add_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index d102c5e4705..abc9b451d17 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -121,5 +121,4 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_embedding_eltwise_layernorm, - ops::EmbeddingEltWiseLayerNormKernel); + ops::EmbeddingEltWiseLayerNormKernel); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 37aa5cbd14d..3e117c45359 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -397,8 +397,8 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { const T* w_data = w->data(); T* out_data = out->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto& dev_ctx = ctx.template device_context(); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.GEMM(false, false, M, diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 4126f5ad726..60b5ecfdd74 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -36,7 +36,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor& tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext& ctx) { + const 
phi::GPUContext& ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -73,7 +73,7 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT template class FusedFeedForwardKernel : public framework::OpKernel { public: - void MatMul(const platform::CUDADeviceContext& ctx, + void MatMul(const phi::GPUContext& ctx, const framework::Tensor& a, const framework::Tensor& b, framework::Tensor* c) const { @@ -86,7 +86,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); } - void FFN(const platform::CUDADeviceContext& ctx, + void FFN(const phi::GPUContext& ctx, const framework::Tensor& x, const framework::Tensor& linear1_weight, const framework::Tensor* linear1_bias, @@ -309,7 +309,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { template class FusedFeedForwardGradKernel : public framework::OpKernel { public: - void MatMulGrad(const platform::CUDADeviceContext& ctx, + void MatMulGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& a, const framework::Tensor& b, @@ -327,7 +327,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); } - void FFNGrad(const platform::CUDADeviceContext& ctx, + void FFNGrad(const phi::GPUContext& ctx, const framework::Tensor& d_out, const framework::Tensor& x, const framework::Tensor& dropout1_mask, @@ -630,14 +630,12 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_feedforward, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel, - ops::FusedFeedForwardKernel); + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel); REGISTER_OP_CUDA_KERNEL( fused_feedforward_grad, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, - ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel); diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 45d47908b99..f9d9fad110e 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -44,7 +44,7 @@ inline std::string MemoryDebugString(const Tensor& t) { } template -void AllocWithDebugInfo(const platform::CUDADeviceContext& dev_ctx, +void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, const std::string& info, Tensor* t) { t->mutable_data(dev_ctx.GetPlace()); @@ -59,7 +59,7 @@ struct TernaryAddFunctor { template struct GateAttentionConfig { public: - const platform::CUDADeviceContext& dev_ctx; + const phi::GPUContext& dev_ctx; bool merge_qkv; bool has_gating; @@ -86,7 +86,7 @@ struct GateAttentionConfig { phi::DDim qktv_out_dims; phi::DDim gate_out_dims; - GateAttentionConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -249,7 +249,7 @@ struct GateAttentionConfig { template struct GateAttentionGradConfig : public GateAttentionConfig { public: - GateAttentionGradConfig(const platform::CUDADeviceContext& dev_ctx, + GateAttentionGradConfig(const phi::GPUContext& dev_ctx, const Tensor* query, const Tensor* key, const Tensor* query_weight, @@ -322,7 +322,7 @@ struct 
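fused_attention_op.cu, fused_feedforward_op.cu, and fused_multi_transformer_op.cu share a small static AllReduce helper whose signature is the only thing the patch touches. A sketch of the renamed signature, with the NCCL body elided:

template <typename T>
static void AllReduce(framework::Tensor& tensor,  // NOLINT
                      const int ring_id,
                      const phi::GPUContext& ctx) {
  if (ring_id == -1) return;
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  // ... process-group / NCCL all-reduce on ctx.stream(), unchanged by this patch ...
#endif
}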
GateAttentionGradConfig : public GateAttentionConfig { template class FMHAGateRef { public: - FMHAGateRef(const platform::CUDADeviceContext& dev_ctx, bool merge_qkv) + FMHAGateRef(const phi::GPUContext& dev_ctx, bool merge_qkv) : dev_ctx_(dev_ctx), merge_qkv_(merge_qkv) {} void ComputeForward(const Tensor* nonbatched_bias, @@ -748,7 +748,7 @@ class FMHAGateRef { int64_t stride_a = m * k; int64_t stride_b = k * n; - auto blas = phi::funcs::GetBlas(dev_ctx_); + auto blas = phi::funcs::GetBlas(dev_ctx_); blas.BatchedGEMM(cblas_trans_a, cblas_trans_b, m, @@ -764,7 +764,7 @@ class FMHAGateRef { stride_b); } - const platform::CUDADeviceContext& dev_ctx_; + const phi::GPUContext& dev_ctx_; bool merge_qkv_; }; diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 7400246f407..139a365c10e 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -350,7 +350,7 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); AllocWithDebugInfo(dev_ctx, "fmha_out", fmha_out); if (has_gating) { @@ -441,7 +441,7 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { bool has_gating = ctx.Attr("has_gating"); bool merge_qkv = ctx.Attr("merge_qkv"); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); AllocWithDebugInfo(dev_ctx, "query_grad", query_grad); GateAttentionGradConfig config( diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 3ebb9f9e640..219a517315b 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -29,7 +29,7 @@ template class FusedGemmEpilogueKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -320,7 +320,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { template static void ComputeImpl(const framework::ExecutionContext& ctx) { using Trait = FusedGEMMGradTrait; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const Tensor* dout = ctx.Input("DOut"); const Tensor* x = ctx.Input("X"); const Tensor* y = ctx.Input("Y"); @@ -677,17 +677,14 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel); + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue_grad, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, - ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 
301b62524a5..7bb3498567c 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -688,7 +688,7 @@ void LaunchLayernormResidualDropoutBias( T *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = ctx.GetPlace(); @@ -846,7 +846,7 @@ template void LaunchLayernormResidualDropoutGrad( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const uint32_t rows, const uint32_t cols, const float epsilon, diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index 4f8ceba177e..d3c6cca95ef 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -54,7 +54,7 @@ struct TestFusedLayernormResidualDropoutBias { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; TestFusedLayernormResidualDropoutBias() { rows = 32; @@ -69,7 +69,7 @@ struct TestFusedLayernormResidualDropoutBias { epsilon = 0.00001f; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } TestFusedLayernormResidualDropoutBias(int _rows, @@ -92,7 +92,7 @@ struct TestFusedLayernormResidualDropoutBias { has_layernorm_bias = true; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto devicectx = pool.Get(place); - ctx = reinterpret_cast(devicectx); + ctx = reinterpret_cast(devicectx); } ~TestFusedLayernormResidualDropoutBias() {} diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index a8bebd5012d..a858b31e23c 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -49,7 +49,7 @@ using Tensor = framework::Tensor; template static void AllReduce(framework::Tensor &tensor, // NOLINT const int ring_id, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); @@ -996,7 +996,7 @@ void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, } template -void fmha(const platform::CUDADeviceContext &dev_ctx, +void fmha(const phi::GPUContext &dev_ctx, const Tensor &qkv_tensor, const Tensor &qkv_bias_tensor, const Tensor &src_mask_tensor, @@ -1118,7 +1118,7 @@ __global__ void write_cache_v_kernel(T *cache_v, } template -void write_cache_kv(const platform::CUDADeviceContext &dev_ctx, +void write_cache_kv(const phi::GPUContext &dev_ctx, T *cache_k, T *cache_v, const T *k, diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index f9bf4c3c5a3..c1131cae5d8 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -178,7 +178,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const T *bias, MaskType *mask_data, T *dst, 
- const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { if (residual == dst) return; @@ -323,7 +323,7 @@ void LaunchResidualDropoutBiasGrad(const T *dout, const uint32_t cols, T *dx, T *dbias, - const platform::CUDADeviceContext &ctx) { + const phi::GPUContext &ctx) { const T zero = static_cast(0.0f); auto factor = dropout_prob == static_cast(1.0f) ? zero diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 2ff0d3dc036..ba0652339e9 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -58,7 +58,7 @@ struct FusedResidualDropoutBiasTester { std::vector correct_mask; platform::CUDAPlace place; - platform::CUDADeviceContext *ctx; + phi::GPUContext *ctx; FusedResidualDropoutBiasTester() { rows = 32; @@ -69,7 +69,7 @@ struct FusedResidualDropoutBiasTester { is_test = false; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } FusedResidualDropoutBiasTester(int rows, @@ -86,7 +86,7 @@ struct FusedResidualDropoutBiasTester { is_test(is_test) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto device_ctx = pool.Get(place); - ctx = reinterpret_cast(device_ctx); + ctx = reinterpret_cast(device_ctx); } void SetUp() { diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index 6aba49ea33f..a6a49b7ac62 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -114,9 +114,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext const float padding_value, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = input_data.size() + output_data.size() + seqpool_output_data.size() + lods.size(); auto temp_ptr = @@ -320,9 +319,8 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, const int embedding_size, const bool use_cvm, const int cvm_offset) { - auto stream = - ctx.template device_context().stream(); - auto &dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + auto &dev_ctx = ctx.template device_context(); size_t total_ptr_len = out_grads_data.size() + in_grads_data.size() + cvm_data.size() + lods.size(); auto temp_ptr = diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 22da713f87d..ce892024d8d 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -39,7 +39,7 @@ template class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto* input = ctx.Input("Input"); auto filters = ctx.MultiInput("Filter"); auto bias = ctx.MultiInput("Bias"); diff --git 
a/paddle/fluid/operators/fused/fusion_group_op.cu.cc b/paddle/fluid/operators/fused/fusion_group_op.cu.cc index 9a81a50efba..9ce8842a015 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cu.cc @@ -18,8 +18,7 @@ limitations under the License. */ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - fusion_group, - ops::FusionGroupKernel, - ops::FusionGroupKernel, - ops::FusionGroupKernel); +REGISTER_OP_CUDA_KERNEL(fusion_group, + ops::FusionGroupKernel, + ops::FusionGroupKernel, + ops::FusionGroupKernel); diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc index 89058cc3fd9..9a1e58c6320 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc @@ -52,7 +52,7 @@ class TransposeFlattenConcatFusionKernel : public framework::OpKernel { platform::dynload::cudnnCreateTensorDescriptor(&out_desc)); cudnnDataType_t cudnn_dtype = CudnnDataType::type; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); T* odata = out->data(); diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu index 4fd51aec24a..8a6d5b313ad 100644 --- a/paddle/fluid/operators/fused/multihead_matmul_op.cu +++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu @@ -237,7 +237,7 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { auto *temp_out_data = temp_out_tensor.mutable_data(context.GetPlace()); // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) - auto blas = phi::funcs::GetBlas(device_ctx); + auto blas = phi::funcs::GetBlas(device_ctx); blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); // temp_out_tensor.Resize(temp_out_dims); @@ -285,6 +285,5 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - multihead_matmul, - ops::MultiHeadMatMulV2Kernel); +REGISTER_OP_CUDA_KERNEL(multihead_matmul, + ops::MultiHeadMatMulV2Kernel); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu index 188c2b21be0..d0a8788e0db 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cu +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -90,7 +90,7 @@ class ResNetUnitKernel : public framework::OpKernel { output_channel; auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // 1. Conv Tensor sum_x; @@ -268,7 +268,7 @@ class ResNetUnitGradKernel : public framework::OpKernel { auto bitmask_shape = phi::vectorize(bitmask->dims()); auto place = ctx.GetPlace(); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, // scale_x_grad, bias_x_grad diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu index 117228d2864..1f9640dd4ba 100644 --- a/paddle/fluid/operators/fused/skip_layernorm_op.cu +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -69,6 +69,5 @@ class SkipLayerNormKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - skip_layernorm, - ops::SkipLayerNormKernel); +REGISTER_OP_CUDA_KERNEL(skip_layernorm, + ops::SkipLayerNormKernel); diff --git a/paddle/fluid/operators/fused/yolo_box_head_op.cu b/paddle/fluid/operators/fused/yolo_box_head_op.cu index d20ffa274a8..b82b9a931a1 100644 --- a/paddle/fluid/operators/fused/yolo_box_head_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_head_op.cu @@ -72,8 +72,7 @@ class YoloBoxHeadKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto anchors = context.Attr>("anchors"); auto class_num = context.Attr("class_num"); - auto& device_ctx = - context.template device_context(); + auto& device_ctx = context.template device_context(); auto x_dims = x->dims(); const int batch_size = x_dims[0]; const int h = x_dims[2]; diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu index 6ebf9b8eb31..c259d0efb49 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_op.cu @@ -587,9 +587,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_softmax_mask, - ops::SoftmaxMaskFuseKernel, - ops::SoftmaxMaskFuseKernel); + ops::SoftmaxMaskFuseKernel, + ops::SoftmaxMaskFuseKernel); REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_grad, - ops::SoftmaxMaskFuseGradKernel, - ops::SoftmaxMaskFuseGradKernel); + ops::SoftmaxMaskFuseGradKernel, + ops::SoftmaxMaskFuseGradKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 1849108ed66..54db576d317 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -577,12 +577,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle, - ops::SoftmaxMaskFuseUpperTriangleKernel, - ops::SoftmaxMaskFuseUpperTriangleKernel); + ops::SoftmaxMaskFuseUpperTriangleKernel, + ops::SoftmaxMaskFuseUpperTriangleKernel); REGISTER_OP_CUDA_KERNEL( fused_softmax_mask_upper_triangle_grad, - ops::SoftmaxMaskFuseUpperTriangleGradKernel, - ops::SoftmaxMaskFuseUpperTriangleGradKernel); + ops::SoftmaxMaskFuseUpperTriangleGradKernel, + ops::SoftmaxMaskFuseUpperTriangleGradKernel); diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index a7b64223be7..fa28481f4c4 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -143,8 +143,7 @@ struct gpu_gather_scatter_functor { int block = 512; int64_t n = inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); GatherScatterGPUKernel <<>>(self_data, dim, @@ -257,8 +256,7 @@ void gpu_scatter_input_grad_kernel(Tensor self, int block = 512; int64_t n = 
inner_dim_size * select_dim_size * outer_dim_size; int64_t grid = (n + block - 1) / block; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); ScatterInputGradGPUKernel <<>>(grad_data, diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 1e89091b202..81b53c8b949 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -61,8 +61,7 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::DefaultCUDAGenerator(device_id); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); if (seed == 0) { // use global Generator seed diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index 06720f1db11..fc8f195fb70 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -245,8 +245,7 @@ void SampleNeighbors(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(0, k, bs, @@ -305,8 +304,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -319,8 +317,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, thrust::raw_pointer_cast(item_count.data()), num_input, @@ -338,8 +335,7 @@ void FillHashTable(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(input, num_input, len_hashtable, @@ -398,8 +394,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(outputs->data()), outputs->size(), size, @@ -411,8 +406,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>(thrust::raw_pointer_cast(orig_nodes->data()), bs, thrust::raw_pointer_cast(reindex_nodes->data()), @@ -625,8 +619,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { <<( - ctx.device_context()) + reinterpret_cast(ctx.device_context()) .stream()>>>( unique_dst_size, thrust::raw_pointer_cast(unique_dst_merge_reindex.data()), @@ -650,7 +643,7 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(graph_khop_sampler, diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 48872cb8caa..da9ccdf627f 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -41,7 +41,7 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); @@ -90,7 +90,7 @@ 
class CUDNNGridSampleGradOpKernel : public framework::OpKernel { true, platform::errors::InvalidArgument( "It must use CUDAPlace when using CUDA Kernel")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); auto* input = ctx.Input("X"); auto* grid = ctx.Input("Grid"); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index abf367f70e2..668f69b4c75 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -261,8 +261,7 @@ __global__ void GroupNormForward(const T* x, } template -class GroupNormKernel - : public framework::OpKernel { +class GroupNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -291,8 +290,8 @@ class GroupNormKernel y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); auto* x_data = x->data(); @@ -597,8 +596,7 @@ __global__ void GetXGradientCUDAKernel(int imsize, } template -class GroupNormGradKernel - : public framework::OpKernel { +class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const std::string data_layout_str = ctx.Attr("data_layout"); @@ -629,8 +627,8 @@ class GroupNormGradKernel : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); + phi::funcs::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); Tensor ds, db; ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); @@ -816,11 +814,9 @@ class GroupNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - group_norm, - ops::GroupNormKernel, - ops::GroupNormKernel); -REGISTER_OP_CUDA_KERNEL( - group_norm_grad, - ops::GroupNormGradKernel, - ops::GroupNormGradKernel); +REGISTER_OP_CUDA_KERNEL(group_norm, + ops::GroupNormKernel, + ops::GroupNormKernel); +REGISTER_OP_CUDA_KERNEL(group_norm_grad, + ops::GroupNormGradKernel, + ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 37ba915a24f..f3665da1816 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -131,11 +131,9 @@ class GRUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru, - ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CUDA_KERNEL( - gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL(gru, + ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL(gru_grad, + ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cu b/paddle/fluid/operators/gru_unit_op.cu index 979a20a64ee..adaaf1d09cd 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -14,11 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CUDA_KERNEL( - gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit, + ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL(gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 835312851b2..0d1006658a4 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -155,9 +155,7 @@ REGISTER_OP_CPU_KERNEL(hinge_loss, REGISTER_OP_CPU_KERNEL(hinge_loss_grad, ops::HingeLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL(hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 503b64c3431..b58f9a55756 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -200,9 +200,7 @@ REGISTER_OP_CPU_KERNEL(im2sequence, REGISTER_OP_CPU_KERNEL(im2sequence_grad, ops::Im2SequenceGradKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL(im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index a63cd8b0071..044b8118abb 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -225,16 +225,14 @@ namespace plat = paddle::platform; #ifdef PADDLE_WITH_HIP // MIOPEN do not support double REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel); #else REGISTER_OP_CUDA_KERNEL(inplace_abn, - ops::InplaceABNKernel, - ops::InplaceABNKernel); -REGISTER_OP_CUDA_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel, - ops::InplaceABNGradKernel); + ops::InplaceABNKernel, + ops::InplaceABNKernel); +REGISTER_OP_CUDA_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel, + ops::InplaceABNGradKernel); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 45b2a05211e..80534d29b5a 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -1337,8 +1337,8 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1432,8 +1432,8 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - 
auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1581,8 +1581,8 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, dim_grad = {n, in_d, in_h, in_w, c}; } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu index 88447aa830f..d8e18f58fa9 100644 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -17,44 +17,23 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(isinf, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isinf, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isnan, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isnan, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); -REGISTER_OP_CUDA_KERNEL(isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CUDA_KERNEL( + isfinite, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index c7bf0d538bd..093a33d89b0 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -97,8 +97,6 @@ REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); REGISTER_OP_CPU_KERNEL(l1_norm_grad, ops::L1NormGradKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL(l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 01abe645495..d14cc076261 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -61,8 +61,7 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel { auto n_expert = expert_count->numel() / n_worker; const auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); dim3 grid_dim(256); dim3 block_dim(1024); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index 240f6b06325..008305bdb93 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -85,7 +85,7 @@ class LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif VLOG(3) << "lite engine run"; @@ -103,7 +103,7 @@ class 
LiteEngineOp : public framework::OperatorBase { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(dev_place)) { platform::GpuStreamSync( - static_cast(ctx)->stream()); + static_cast(ctx)->stream()); } #endif } diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fed71abe166..d631c3c7317 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -72,7 +72,7 @@ TEST(LiteEngineOp, engine_op) { framework::Scope scope; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index 574b7cbec28..b8892e9c88f 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -58,7 +58,7 @@ void serialize_params(std::string* str, std::ostringstream os; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); #else phi::CPUContext ctx; #endif diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu index 2a42c0daa7f..9405b3564b9 100644 --- a/paddle/fluid/operators/load_combine_op.cu +++ b/paddle/fluid/operators/load_combine_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu index c122978d12c..04c456ac603 100644 --- a/paddle/fluid/operators/load_op.cu +++ b/paddle/fluid/operators/load_op.cu @@ -16,10 +16,9 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); +REGISTER_OP_CUDA_KERNEL(load, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu index a910ad549f1..25aad4c4afc 100644 --- a/paddle/fluid/operators/lod_reset_op.cu +++ b/paddle/fluid/operators/lod_reset_op.cu @@ -16,15 +16,13 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_CUDA_KERNEL( - lod_reset_grad, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL(lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index d4b36f31e62..ab4d95c592f 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor Apply(static_cast(dev_ctx)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - Apply(static_cast(dev_ctx)); + Apply(static_cast(dev_ctx)); #else PADDLE_THROW( platform::errors::Unavailable("Paddle is not compiled with CUDA.")); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 41740923b42..073077f6586 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -151,8 +151,7 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index ef9bd7865d6..7b4ed84fc20 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -142,8 +142,7 @@ struct LookupTableV2GradCUDAFunctor { template void apply() { - auto &dev_ctx = - context_.template device_context(); + auto &dev_ctx = context_.template device_context(); bool is_sparse = context_.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index c736dfb48a6..8c95cf1d0c9 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -97,7 +97,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha, data_layout); @@ -108,7 +108,7 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, @@ -138,8 +138,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, @@ -218,7 +218,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - auto& dev_ctx = ctx.template 
device_context(); + auto& dev_ctx = ctx.template device_context(); KeCMRNormDiff <<>>(img_size, x, @@ -236,7 +236,7 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, } template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -268,13 +268,11 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lrn, ops::LRNKernel); -REGISTER_OP_CUDA_KERNEL( - lrn_grad, ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL(lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc index 60364ef4486..13a0ded14b4 100644 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CUDA_KERNEL( - lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL(lstm, + ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu index 11c2844ccc3..8614eaf5d49 100644 --- a/paddle/fluid/operators/lstmp_op.cu +++ b/paddle/fluid/operators/lstmp_op.cu @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/lstmp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CUDA_KERNEL( - lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +REGISTER_OP_CUDA_KERNEL(lstmp, + ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL(lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index f063716b200..e9d1a6a136a 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -38,10 +38,8 @@ class LstsqCUDAKernel : public framework::OpKernel { auto* solution = context.Output("Solution"); auto dito = - math::DeviceIndependenceTensorOperations(context); - auto& dev_ctx = - context.template device_context(); + math::DeviceIndependenceTensorOperations(context); + auto& dev_ctx = context.template device_context(); auto x_dims = x.dims(); auto y_dims = y.dims(); @@ -163,20 +161,19 @@ class LstsqCUDAKernel : public framework::OpKernel { }; template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - float* a, - int a_stride, - float* tau, - int tau_stride, - float* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + float* a, + int a_stride, + float* tau, + int tau_stride, + float* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? 
CUBLAS_OP_T : CUBLAS_OP_N; @@ -232,20 +229,19 @@ void BatchedOrmqr( } template <> -void BatchedOrmqr( - const platform::CUDADeviceContext& dev_ctx, - bool left, - bool transpose, - int batch_size, - int m, - int n, - int k, - double* a, - int a_stride, - double* tau, - int tau_stride, - double* other, - int other_stride) { +void BatchedOrmqr(const phi::GPUContext& dev_ctx, + bool left, + bool transpose, + int batch_size, + int m, + int n, + int k, + double* a, + int a_stride, + double* tau, + int tau_stride, + double* other, + int other_stride) { int lwork = 0; auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? CUBLAS_OP_T : CUBLAS_OP_N; @@ -305,9 +301,8 @@ void BatchedOrmqr( namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - lstsq, - ops::LstsqCUDAKernel, - ops::LstsqCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstsq, + ops::LstsqCUDAKernel, + ops::LstsqCUDAKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index b58142d793c..6d1ff9f296e 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -87,7 +87,7 @@ void GetClassInterval(const gpuStream_t& stream, const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream const auto calcu_stream = - static_cast( + static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -275,7 +275,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const float scale = ctx.Attr("scale"); const auto& place = ctx.GetPlace(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLComm* comm; @@ -290,7 +290,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { comm = platform::NCCLCommContext::Instance().Get(rid, place); // use global calculate stream - stream = static_cast( + stream = static_cast( platform::DeviceContextPool::Instance().Get(place)) ->stream(); } @@ -377,8 +377,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 2, obtain logit_max Tensor logits_max; - logits_max = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + logits_max = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* logits_max_buff = logits_max.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -420,8 +419,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 4, sum(exp(logit - logit_max)) Tensor sum_exp_logits; - sum_exp_logits = - ctx.AllocateTmpTensor({N, 1}, dev_ctx); + sum_exp_logits = ctx.AllocateTmpTensor({N, 1}, dev_ctx); T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); TensorReduceImpl>( dev_ctx, @@ -465,7 +463,7 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - phi::funcs::SetConstant()( + phi::funcs::SetConstant()( dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; @@ -543,8 +541,7 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const float margin3 = context.Attr("margin3"); const float scale = context.Attr("scale"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const auto sofrmax_dims = 
softmax->dims(); const int axis = sofrmax_dims.size() - 1; diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu index d7e77e92302..f672381ed7a 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cu +++ b/paddle/fluid/operators/margin_rank_loss_op.cu @@ -16,9 +16,7 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_CUDA_KERNEL( - margin_rank_loss_grad, - ops::MarginRankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL(margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu index 2c58b99396e..3b52788514b 100644 --- a/paddle/fluid/operators/marker_op.cu +++ b/paddle/fluid/operators/marker_op.cu @@ -33,7 +33,7 @@ template class MarkerOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto marker_role = ctx.Attr("marker_role"); auto marker_pos = ctx.Attr("marker_pos"); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index 2008e6b3fa2..80af6f673c4 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -403,9 +403,9 @@ static inline int GetNumUsedThreads(const int max_threads_per_seq, } template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::LoDTensor* pre_ids, const framework::LoDTensor* pre_scores, const framework::LoDTensor* ids, @@ -531,10 +531,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index f6b0349f1ca..87785bfdc85 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -144,15 +144,14 @@ void TestBeamSearch() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestBeamSearch() { +void TestBeamSearch() { paddle::framework::LoDTensor ids; paddle::framework::LoDTensor scores; paddle::framework::LoDTensor pre_ids; paddle::framework::LoDTensor pre_scores; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -185,9 +184,7 @@ void TestBeamSearch - beamsearch; + paddle::operators::math::BeamSearchFunctor beamsearch; beamsearch(*context, &pre_ids, &pre_scores, @@ -235,7 +232,6 @@ TEST(BeamSearch, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(BeamSearch, GPU) { - TestBeamSearch(); + TestBeamSearch(); } #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu 
b/paddle/fluid/operators/math/bert_encoder_functor.cu index b8c23cafe6d..42a54195def 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -532,7 +532,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(half2 *qk_buf_, } template -inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQK(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -549,8 +549,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, CBLAS_TRANSPOSE transB = !k_trans ? CblasNoTrans : CblasTrans; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); blas.BatchedGEMM(transA, @@ -625,7 +624,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, } template -inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, +inline void MatMulWithHeadQKV(const phi::GPUContext &context, int head_num, int seq_len, int size_per_head, @@ -641,8 +640,7 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, int k = head_num * size_per_head; typedef typename CUDATypeTraits::TYPE run_type; - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); auto stream = context.stream(); CBLAS_TRANSPOSE transA = !qk_trans ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !v_trans ? CblasNoTrans : CblasTrans; @@ -663,17 +661,16 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context, } template -void MultiHeadGPUComputeFunctor::operator()( - const platform::CUDADeviceContext &dev_ctx, - int batch, - int seq_len, - int head_num, - int head_size, - T *qkptr, - const T *bias_qk_ptr, - T *tptr, - T alpha, - T beta) { +void MultiHeadGPUComputeFunctor::operator()(const phi::GPUContext &dev_ctx, + int batch, + int seq_len, + int head_num, + int head_size, + T *qkptr, + const T *bias_qk_ptr, + T *tptr, + T alpha, + T beta) { auto stream = dev_ctx.stream(); const int tsize = batch * head_num * seq_len * head_size; diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index 55d3dd2c3e8..bc59e2fa1a3 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -93,7 +93,7 @@ class EmbEltwiseLayerNormFunctor { template class MultiHeadGPUComputeFunctor { public: - void operator()(const platform::CUDADeviceContext &dev_ctx, + void operator()(const phi::GPUContext &dev_ctx, int batch, int seq_len, int head_num, diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 298c2f4e5ef..11508fd2d1e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -23,9 +23,9 @@ namespace math { * each dimension must be the same, except the axis dimension. */ template -class ConcatFunctor { +class ConcatFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::vector& input, int axis, framework::Tensor* output) { @@ -39,9 +39,9 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class SplitFunctor { +class SplitFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const std::vector& ref_inputs, int axis, @@ -51,9 +51,9 @@ class SplitFunctor { } }; -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 4f0fee91e59..ccbe1c2aeed 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -469,24 +469,18 @@ void TestConcatMain() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void TestConcatMain() { - auto* context = - new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace()); +void TestConcatMain() { + auto* context = new phi::GPUContext(paddle::platform::CUDAPlace()); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CUDAPlace(), context->stream()) .get()); context->PartialInitWithAllocator(); - ConcatCase1( - context); - ConcatCase2( - context); - ConcatCase3( - context); - ConcatCase4( - context); + ConcatCase1(context); + ConcatCase2(context); + ConcatCase3(context); + ConcatCase4(context); delete context; } @@ -495,7 +489,6 @@ void TestConcatMain(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - TestConcatMain(); + TestConcatMain(); #endif } diff --git a/paddle/fluid/operators/math/context_project.cu b/paddle/fluid/operators/math/context_project.cu index f04b2d15349..70b3d67caf3 100644 --- a/paddle/fluid/operators/math/context_project.cu +++ b/paddle/fluid/operators/math/context_project.cu @@ -17,8 +17,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index 34aeabfac64..cbe76844519 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -50,8 +50,8 @@ __global__ void CosSimDyKernel(const T* x_norm, } template -struct CosSimDyFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct CosSimDyFunctor { + void operator()(const phi::GPUContext& ctx, const T* x_norm, const T* y_norm, const T* x, @@ -69,8 +69,8 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; -template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index a056341c3bf..61682a95c13 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -176,7 +176,7 @@ struct MatrixEighFunctor { // symmetric matrices on GPU, and uses the variable has_vectors // to control whether to return the eigenvectors. 
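Note on the math-functor hunks above (ConcatFunctor, SplitFunctor, CosSimDyFunctor and the like): each keeps its generic primary template and only the GPU specialization's first template argument, plus the matching explicit instantiations, moves to phi::GPUContext. A minimal standalone sketch of that shape, using toy types rather than Paddle's, is:

#include <iostream>

// Toy stand-ins for the framework's device-context classes; GpuCtx plays the
// role that phi::GPUContext takes over in this patch.
struct CpuCtx {};
struct GpuCtx {};

// Primary template: one functor parameterized on the device-context type.
template <typename DeviceContext, typename T>
struct ScaleFunctor;

// CPU specialization.
template <typename T>
struct ScaleFunctor<CpuCtx, T> {
  void operator()(const CpuCtx&, T* data, int n, T factor) const {
    for (int i = 0; i < n; ++i) data[i] *= factor;
  }
};

// GPU specialization: when the context class is unified, only this first
// template argument and the explicit instantiations change; the functor body
// and every call site keep the same shape.
template <typename T>
struct ScaleFunctor<GpuCtx, T> {
  void operator()(const GpuCtx&, T* data, int n, T factor) const {
    for (int i = 0; i < n; ++i) data[i] *= factor;  // stand-in for a kernel launch
  }
};

// Explicit instantiations, mirroring the "template class ConcatFunctor...;"
// lines in the hunks above, which now name phi::GPUContext as the context type.
template struct ScaleFunctor<GpuCtx, float>;
template struct ScaleFunctor<GpuCtx, double>;

int main() {
  GpuCtx gpu;
  float v[3] = {1.f, 2.f, 3.f};
  ScaleFunctor<GpuCtx, float>()(gpu, v, 3, 2.f);
  std::cout << v[0] << ' ' << v[1] << ' ' << v[2] << '\n';  // prints 2 4 6
  return 0;
}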
template -struct MatrixEighFunctor { +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, @@ -187,10 +187,9 @@ struct MatrixEighFunctor { using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto dito = - math::DeviceIndependenceTensorOperations(ctx); + math::DeviceIndependenceTensorOperations(ctx); Tensor input_trans; input_trans = dito.Transpose(input); auto *input_vector = input_trans.data(); @@ -324,34 +323,34 @@ struct MatrixEighFunctor { m(paddle::platform::complex, Che, cuComplex) \ m(paddle::platform::complex, Zhe, cuDoubleComplex) -#define EVDBUFFER_INSTANCE(T, C, CastType) \ - template <> \ - inline void MatrixEighFunctor::EvdBuffer( \ - cusolverDnHandle_t handle, \ - cusolverEigMode_t jobz, \ - cublasFillMode_t uplo, \ - int n, \ - const T *A, \ - int lda, \ - const ValueType *W, \ - int *lwork) const { \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##evd_bufferSize( \ - handle, \ - jobz, \ - uplo, \ - n, \ - reinterpret_cast(A), \ - lda, \ - W, \ - lwork)); \ +#define EVDBUFFER_INSTANCE(T, C, CastType) \ + template <> \ + inline void MatrixEighFunctor::EvdBuffer( \ + cusolverDnHandle_t handle, \ + cusolverEigMode_t jobz, \ + cublasFillMode_t uplo, \ + int n, \ + const T *A, \ + int lda, \ + const ValueType *W, \ + int *lwork) const { \ + PADDLE_ENFORCE_GPU_SUCCESS( \ + platform::dynload::cusolverDn##C##evd_bufferSize( \ + handle, \ + jobz, \ + uplo, \ + n, \ + reinterpret_cast(A), \ + lda, \ + W, \ + lwork)); \ } FUNC_WITH_TYPES(EVDBUFFER_INSTANCE); #define EVD_INSTANCE(T, C, CastType) \ template <> \ - inline void MatrixEighFunctor::Evd( \ + inline void MatrixEighFunctor::Evd( \ cusolverDnHandle_t handle, \ cusolverEigMode_t jobz, \ cublasFillMode_t uplo, \ diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index 44ce4f0d6d3..49aae2ebc1d 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -94,7 +94,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -180,8 +180,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -230,7 +230,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -324,10 +324,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct 
GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index 93ee9d3a15b..09ec777ebb6 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -179,8 +179,7 @@ void testIm2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testIm2col() { +void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; paddle::framework::Tensor output_cfo; @@ -222,7 +221,7 @@ void testIm2colSetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -240,12 +239,12 @@ void testIm2col im2col; paddle::operators::math::Im2ColFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> im2col_ocf; @@ -283,12 +282,12 @@ void testIm2col col2im; paddle::operators::math::Col2ImFunctor< paddle::operators::math::ColFormat::kOCF, - paddle::platform::CUDADeviceContext, + phi::GPUContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -343,8 +342,7 @@ void testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 716989a7869..f18053e297e 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -124,15 +124,14 @@ int UniqSampler(const Sampler& sampler, } template -void GPUSampleWithProb::operator()( - const platform::CUDADeviceContext& context, - const int seed, - const int dict_size, - const bool uniq, - const std::size_t num_samples, - const Tensor* L, - Tensor* S, - Tensor* P) { +void GPUSampleWithProb::operator()(const phi::GPUContext& context, + const int seed, + const int dict_size, + const bool uniq, + const std::size_t num_samples, + const Tensor* L, + Tensor* S, + Tensor* P) { // UNDERSTAND: dimension issues const auto lbl_dim = L->dims(); const int batch_size = lbl_dim[0]; diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index bb5c2ef9799..1e8fb983a94 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -111,7 +111,7 @@ class SampleWithProb { template class GPUSampleWithProb { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const int seed, const int dict_size, const bool uniq, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index f09578a0b1c..7fa9dc27db9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -25,8 +25,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAdd { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const phi::SelectedRows& input2, phi::SelectedRows* output) { @@ -109,8 +109,8 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -210,8 +210,8 @@ template struct SelectedRowsAdd; template struct SelectedRowsAddTensor; 
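Operator-side call sites such as the MatrixEighFunctor and GRUUnitFunctor hunks above (and the kernels earlier in the patch) change only the type requested from ExecutionContext::device_context<>(); helpers such as phi::funcs::GetBlas and phi::funcs::SetConstant get the same single-argument swap to phi::GPUContext. A minimal standalone sketch of that lookup pattern, with toy types standing in for the Paddle classes, is:

#include <iostream>

// Toy stand-ins: GPUContext mimics the accessors the kernels rely on
// (stream(), cudnn_handle()); the real class is phi::GPUContext.
struct GPUContext {
  int stream() const { return 0; }                  // stand-in for a gpuStream_t
  const char* cudnn_handle() const { return "h"; }  // stand-in for cudnnHandle_t
};

// Minimal ExecutionContext that hands out the requested device-context type.
// Paddle's real call sites spell this ctx.template device_context<...>()
// because they sit inside templated kernel code.
class ExecutionContext {
 public:
  template <typename DeviceContext>
  const DeviceContext& device_context() const {
    return gpu_ctx_;  // this toy only supports the GPU context
  }

 private:
  GPUContext gpu_ctx_;
};

// A kernel body in the style of the patch: the only edit is the template
// argument used to fetch the context; stream/handle usage is untouched.
void Compute(const ExecutionContext& ctx) {
  auto& dev_ctx = ctx.device_context<GPUContext>();
  std::cout << "stream=" << dev_ctx.stream()
            << " handle=" << dev_ctx.cudnn_handle() << '\n';
}

int main() {
  ExecutionContext ctx;
  Compute(ctx);
  return 0;
}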
template -struct SelectedRowsAddTo { - void operator()(const platform::CUDADeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const phi::GPUContext& context, const phi::SelectedRows& input1, const int64_t input2_offset, phi::SelectedRows* input2) { @@ -259,12 +259,11 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -588,14 +587,14 @@ __global__ void UpdateToTensorKernel(const T* selected_rows, } template -struct UpdateToTensor { - void operator()(const platform::CUDADeviceContext& context, +struct UpdateToTensor { + void operator()(const phi::GPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { // NOTE: Use SelectedRowsAddToTensor for better performance // no additional MergeAdd called. - MergeAdd merge_func; + MergeAdd merge_func; auto merged_in1 = merge_func(context, input1); auto in1_height = merged_in1.height(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 877c3c63aff..746a64ff58c 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -20,10 +20,9 @@ limitations under the License. */ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -62,9 +61,7 @@ TEST(selected_rows_functor, gpu_add) { // simply concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math::SelectedRowsAdd - add_functor; + paddle::operators::math::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -108,9 +105,8 @@ TEST(selected_rows_functor, gpu_add) { new paddle::framework::Tensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTensor - add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); paddle::framework::Tensor tensor2_cpu; @@ -137,10 +133,9 @@ TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant functor; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -169,9 +164,8 @@ TEST(selected_rows_functor, gpu_add_to) { // simply concat two SelectedRows 
out_value->mutable_data(phi::make_ddim({7, 10}), gpu_place); - paddle::operators::math:: - SelectedRowsAddTo - add_to_functor; + paddle::operators::math::SelectedRowsAddTo + add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -212,9 +206,8 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(phi::make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); paddle::framework::Tensor tensor1_cpu; @@ -241,10 +234,9 @@ TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_merge_add) { paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CPUPlace cpu_place; - paddle::platform::CUDADeviceContext& ctx = - *reinterpret_cast( - paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - phi::funcs::SetConstant set_const; + phi::GPUContext& ctx = *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -269,9 +261,8 @@ TEST(selected_rows_functor, gpu_merge_add) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter:: - MergeAdd - merge_add_functor; + paddle::operators::math::scatter::MergeAdd + merge_add_functor; std::vector inputs; inputs.push_back(selected_rows1.get()); diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index 06eca480ec6..84944270f45 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -116,17 +116,15 @@ TEST(Seq2BatchPadding, CPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePadding, CUDA) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding( - *context, lod1, 16); + TestSequencePadding(*context, lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding( - *context, lod2, 128); + TestSequencePadding(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 9ee3b107bea..a5edb1db95c 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -189,9 +189,9 @@ __global__ void sequence_pool_kernel(Range_OP op, } template -class SequencePoolFunctor { +class SequencePoolFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, @@ -408,9 +408,9 @@ __global__ void sequence_pool_grad_kernel(Range_OP op, } template -class SequencePoolGradFunctor { +class SequencePoolGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const std::string pooltype, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, @@ -493,10 +493,10 @@ class SequencePoolGradFunctor { }; // 
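Note: the GPU tests above (selected_rows_functor_test.cu.cc) fetch the context from the device-context pool; the cast target is stripped in this copy, but the surrounding + lines name phi::GPUContext. A minimal sketch of that fetch:

paddle::platform::CUDAPlace gpu_place(0);
// Pool-owned context; the cast type follows the + lines above.
phi::GPUContext& ctx = *reinterpret_cast<phi::GPUContext*>(
    paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
// Constant-fill helper, now templated on the unified context.
phi::funcs::SetConstant<phi::GPUContext, float> functor;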
sequence pooling -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index 63d922b7ebb..9cff64f7560 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -132,17 +132,15 @@ TEST(SequencePoolingGrad, CPU_SUM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePoolingGrad, CUDA_SUM) { auto place = paddle::platform::CUDAPlace(0); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum( - *context, lod1, 128); + TestSequencePoolingSum(*context, lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum( - *context, lod2, 128); + TestSequencePoolingSum(*context, lod2, 128); } #endif diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index fd501d5188d..3aceceac32d 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -51,9 +51,9 @@ __global__ void tree2col(const T* eta, } } template -class Tree2ColFunctor { +class Tree2ColFunctor { public: - void operator()(const paddle::platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& node_features, framework::Tensor* patch, @@ -63,7 +63,7 @@ class Tree2ColFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -128,9 +128,9 @@ class Tree2ColFunctor { } }; template -class Col2TreeFunctor { +class Col2TreeFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& EdgeSet, const framework::Tensor& patch_grad, framework::Tensor* embedding_grad, @@ -140,7 +140,7 @@ class Col2TreeFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -214,10 +214,10 @@ class Col2TreeFunctor { } }; -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index 5a776433199..253f4cb0279 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -111,9 +111,9 @@ __global__ void KernelUnpool3dMaxGrad(const int nthreads, */ template -class Unpool2dMaxFunctor { 
+class Unpool2dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -148,9 +148,9 @@ class Unpool2dMaxFunctor { * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -189,9 +189,9 @@ class Unpool2dMaxGradFunctor { }; template -class Unpool3dMaxFunctor { +class Unpool3dMaxFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -230,9 +230,9 @@ class Unpool3dMaxFunctor { * All tensors are in NCDHW format. */ template -class Unpool3dMaxGradFunctor { +class Unpool3dMaxGradFunctor { public: - void operator()(const platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -274,14 +274,14 @@ class Unpool3dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxFunctor; -template class Unpool3dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index ec3926b95ee..c0c4ed5bb5d 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -132,15 +132,14 @@ void testVol2col() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -void testVol2col() { +void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; paddle::framework::Tensor output; paddle::framework::Tensor output_tmp; auto* place = new paddle::platform::CUDAPlace(); - auto* context = new paddle::platform::CUDADeviceContext(*place); + auto* context = new phi::GPUContext(*place); context->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(*place, context->stream()) .get()); @@ -202,9 +201,7 @@ void testVol2col - vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -230,9 +227,7 @@ void testVol2col - col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -256,7 +251,6 @@ void testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/fluid/operators/matmul_op.cc 
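Note: im2col_test.cc and vol2col_test.cc above construct the context directly instead of instantiating platform::CUDADeviceContext, then wire the device allocator before use. A sketch of that setup, assuming the hunk shows all the wiring the tests need:

auto* place = new paddle::platform::CUDAPlace();
auto* context = new phi::GPUContext(*place);
// The freshly built context has no allocator; attach the facade's
// stream-bound allocator exactly as the hunk above does.
context->SetAllocator(
    paddle::memory::allocation::AllocatorFacade::Instance()
        .GetAllocator(*place, context->stream())
        .get());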
b/paddle/fluid/operators/matmul_op.cc index c79073861ab..ff7ab502e8e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1055,20 +1055,17 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( matmul, - ops::MatMulKernel, - ops::MatMulKernel, - ops::MatMulKernel); + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); REGISTER_OP_CUDA_KERNEL( matmul_grad, - ops::MatMulGradKernel, - ops::MatMulGradKernel, - ops::MatMulGradKernel); -REGISTER_OP_CUDA_KERNEL( - matmul_grad_grad, - ops::MatMulDoubleGradKernel, - ops::MatMulDoubleGradKernel); + ops::MatMulGradKernel, + ops::MatMulGradKernel, + ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL(matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #endif REGISTER_OP_VERSION(matmul).AddCheckpoint( diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 1359bd62b49..08ab074718b 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -92,7 +92,7 @@ template class MeanIoUCUDAOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); auto& place = *dev_ctx.eigen_device(); // get input and output tensor auto* predictions = ctx.Input("Predictions"); diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 8cd84f4b59e..b0513b0af84 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -40,8 +40,7 @@ class MemcpyH2DFunctor { void operator()(const framework::LoDTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto stream = - static_cast(&dev_ctx_)->stream(); + auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; #endif diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc index 90d5fb3eaeb..16b9b5dc6bd 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cu.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - merge_selected_rows, - ops::MergeSelectedRowsKernel, - ops::MergeSelectedRowsKernel); +REGISTER_OP_CUDA_KERNEL(merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 1e369c81538..310d28738fc 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -155,5 +155,4 @@ REGISTER_OPERATOR(minus, ops::MinusGradMaker); REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel); -REGISTER_OP_CUDA_KERNEL( - minus, ops::MinusKernel); +REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 9450b72c95f..67c3a5d90da 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -76,8 +76,7 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - modified_huber_loss, - ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss, + ops::ModifiedHuberLossKernel); REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index b74c1fca088..01ca5d43090 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -58,7 +58,7 @@ class NCCLTester : public ::testing::Test { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list_.size(); ++i) { p::CUDAPlace place(i); - auto *ctx = new p::CUDADeviceContext(place); + auto *ctx = new phi::GPUContext(place); ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx->stream()) .get()); @@ -184,7 +184,7 @@ void NCCLTester::testNcclAllReduceOp() { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - auto *dev_ctx = static_cast(dev_ctxs_[i]); + auto *dev_ctx = static_cast(dev_ctxs_[i]); paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[i]), @@ -296,7 +296,7 @@ void NCCLTester::testNcclBcastOp() { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - auto *dev_ctx = static_cast(dev_ctxs_[idx]); + auto *dev_ctx = static_cast(dev_ctxs_[idx]); paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index 64f5bc9eab4..330163b1f93 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -92,8 +92,7 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { int64_t batch_size = numbers->numel(); auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::DDim out_dims = phi::make_ddim({upper_range}); auto out_data = number_count->mutable_data(out_dims, place); diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 6dfc4a7d13c..85594ff0574 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -99,7 +99,6 @@ class OneHotCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = 
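Note: the registration hunks above (merge_selected_rows, minus, modified_huber_loss, one_hot) are mechanical: each kernel's device-context template argument becomes phi::GPUContext and the macro call is then reflowed. The dtype lists are stripped in this copy, so the ones below are assumptions used only to show the shape:

namespace ops = paddle::operators;
// Single-dtype case.
REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel<phi::GPUContext, float>);
// Multi-dtype case, one entry per line after reflow.
REGISTER_OP_CUDA_KERNEL(merge_selected_rows,
                        ops::MergeSelectedRowsKernel<phi::GPUContext, float>,
                        ops::MergeSelectedRowsKernel<phi::GPUContext, double>);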
paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot, - ops::OneHotCUDAKernel, - ops::OneHotCUDAKernel); +REGISTER_OP_CUDA_KERNEL(one_hot, + ops::OneHotCUDAKernel, + ops::OneHotCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index 3314e899a13..205eb2853a3 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -31,7 +31,7 @@ struct CastFunctor { }; template -static void VecCastKernel(const platform::CUDADeviceContext &ctx, +static void VecCastKernel(const phi::GPUContext &ctx, const InT *x, OutT *y, size_t n) { @@ -53,7 +53,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, } // namespace details template -static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, +static void LaunchCastKernel(const phi::GPUContext &ctx, const InT *x, OutT *y, size_t n) { diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu index dc568802a2b..30825a6a329 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu @@ -14,6 +14,5 @@ limitations under the License. */ #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - decayed_adagrad, - ops::DecayedAdagradOpKernel); +REGISTER_OP_CUDA_KERNEL(decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu index e7fdeb617de..7909d58a644 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cu @@ -15,6 +15,5 @@ #include "paddle/fluid/operators/optimizers/dgc_momentum_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - dgc_momentum, - ops::DGCMomentumKernel); +REGISTER_OP_CUDA_KERNEL(dgc_momentum, + ops::DGCMomentumKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 7b1397b7df6..e7d795ccc57 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -178,22 +178,21 @@ static size_t FillAlignmentPaddingInfo(std::vector *infos, } template -static T *TensorFillConstant(const platform::CUDADeviceContext &dev_ctx, +static T *TensorFillConstant(const phi::GPUContext &dev_ctx, framework::Tensor *tensor, const framework::DDim &dims, T value) { tensor->Resize(dims); auto *ptr = tensor->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; set_constant(dev_ctx, tensor, value); return ptr; } -static framework::Tensor CastDataForInitedTensor( - const platform::CUDADeviceContext &dev_ctx, - framework::Tensor *origin, - framework::Tensor *fused_out, - size_t numel_offset) { +static framework::Tensor CastDataForInitedTensor(const phi::GPUContext &dev_ctx, + framework::Tensor *origin, + framework::Tensor *fused_out, + size_t numel_offset) { PADDLE_ENFORCE_EQ(origin->IsInitialized(), true, platform::errors::InvalidArgument( @@ -338,12 +337,12 @@ static T ClipByBound(T x, T low_value, T high_value) { } template -class DistributedFusedLambInitOpKernel +class DistributedFusedLambInitOpKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext &ctx) const override { VLOG(10) << "starts to run DistributedFusedLambInitOp"; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto place = ctx.GetPlace(); auto stream = dev_ctx.stream(); @@ -790,4 +789,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb_init, - ops::DistributedFusedLambInitOpKernel); + ops::DistributedFusedLambInitOpKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index f8d55ff9cf7..394e49dd529 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -242,8 +242,7 @@ static void LogParamAndTrustRatioDivSquareNorm( } } -static bool IsFinite(const platform::CUDADeviceContext &dev_ctx, - const float *ptr) { +static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { auto stream = dev_ctx.stream(); float cpu_value; #ifdef PADDLE_WITH_HIP @@ -509,7 +508,7 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( template static void MultiTensorUpdateLambMomentAndTrustRatioDiv( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const T *param_p, @@ -779,7 +778,7 @@ template static void MultiTensorUpdateLambParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const int *offsets, int n, const MasterT *trust_ratio_div, @@ -898,7 +897,7 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, } template -static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, +static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, const T1 *x, const T2 *scale, T1 *y, @@ -925,7 +924,7 @@ static void NCCLSumWithScaleBase(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { static_assert(std::is_same::value || std::is_same::value, @@ -974,15 +973,14 @@ static void NCCLSumWithScaleBase(const T *sendbuff, } template -static void NCCLReduceScatterWithScale( - const T *sendbuff, - T *recvbuff, - size_t recvcount, - size_t nranks, - ncclComm_t comm, - gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, - const T *scale = nullptr) { +static void NCCLReduceScatterWithScale(const T *sendbuff, + T *recvbuff, + size_t recvcount, + size_t nranks, + ncclComm_t comm, + gpuStream_t stream, + const phi::GPUContext &dev_ctx, + const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); } @@ -994,7 +992,7 @@ static void NCCLAllReduceWithScale(const T *sendbuff, size_t nranks, ncclComm_t comm, gpuStream_t stream, - const platform::CUDADeviceContext &dev_ctx, + const phi::GPUContext &dev_ctx, const T *scale = nullptr) { NCCLSumWithScaleBase( sendbuff, recvbuff, recvcount, nranks, comm, stream, dev_ctx, scale); @@ -1104,7 +1102,7 @@ static std::string GetMinMaxStr(const T *x, true, platform::errors::InvalidArgument("Only support CUDAPlace currently.")); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto stream = dev_ctx->stream(); @@ -1276,13 +1274,12 @@ static __global__ void ElementwiseAddWithCastCUDAKernel(const T1 *x, } template -static void LaunchElementwiseAddWithCastKernel( - 
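Note: inside operator Compute bodies the change is a single template argument on device_context(); stream and place handling is untouched. Sketch of the post-patch idiom:

void Compute(const framework::ExecutionContext& ctx) const override {
  // Was: ctx.template device_context<platform::CUDADeviceContext>()
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto place = ctx.GetPlace();
  auto stream = dev_ctx.stream();
  // ... kernel body unchanged ...
}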
const platform::CUDADeviceContext &dev_ctx, - const T1 *x, - const T2 *y, - T3 *z, - int n, - gpuStream_t stream) { +static void LaunchElementwiseAddWithCastKernel(const phi::GPUContext &dev_ctx, + const T1 *x, + const T2 *y, + T3 *z, + int n, + gpuStream_t stream) { int vec_size = std::min(std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)), GetChunkedVecSize(z, 0)); @@ -1300,12 +1297,12 @@ static void LaunchElementwiseAddWithCastKernel( } template -class DistributedFusedLambOpKernel +class DistributedFusedLambOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); @@ -2135,4 +2132,4 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( distributed_fused_lamb, - ops::DistributedFusedLambOpKernel); + ops::DistributedFusedLambOpKernel); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cu b/paddle/fluid/operators/optimizers/ftrl_op.cu index acf8e38ca0f..dbea7e4d51c 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cu +++ b/paddle/fluid/operators/optimizers/ftrl_op.cu @@ -13,5 +13,4 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - ftrl, ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL(ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu index a9f880fdbb6..0d60979eef0 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cu +++ b/paddle/fluid/operators/optimizers/lamb_op.cu @@ -17,7 +17,6 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lamb, - ops::LambOpKernel, - ops::LambOpKernel, - ops::LambOpKernel); + ops::LambOpKernel, + ops::LambOpKernel, + ops::LambOpKernel); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index d9aef74931a..5337e56b28d 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -419,25 +419,24 @@ __global__ void MomentumLarsKernel(const T* param, } template -inline void SeparatedLarsMomentumOpCUDAKernel( - const platform::CUDADeviceContext& cuda_ctx, - const T* param_data, - T* param_out_data, - const MT* velocity_data, - MT* velocity_out_data, - const T* grad_data, - const MT* lr, - MT* p_buffer, - MT* g_buffer, - const MT mu, - const MT lars_coeff, - const MT weight_decay, - const MT epsilon, - const MT rescale_grad, - const int64_t numel, - const MT* master_param_data, - MT* master_out_data, - const bool is_amp) { +inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx, + const T* param_data, + T* param_out_data, + const MT* velocity_data, + MT* velocity_out_data, + const T* grad_data, + const MT* lr, + MT* p_buffer, + MT* g_buffer, + const MT mu, + const MT lars_coeff, + const MT weight_decay, + const MT epsilon, + const MT rescale_grad, + const int64_t numel, + const MT* master_param_data, + MT* master_out_data, + const bool is_amp) { LarsThreadConfig lars_thread_config(numel); L2NormKernel<< { void Compute(const framework::ExecutionContext& ctx) const override { int num_blocks_per_sm = 0; bool multi_precision = ctx.Attr("multi_precision"); - auto& cuda_ctx = ctx.template device_context(); + auto& cuda_ctx = ctx.template device_context(); int sm_num = cuda_ctx.GetSMCount(); - framework::Tensor tmp_buffer_t = - ctx.AllocateTmpTensor( - {LARS_BLOCK_SIZE << 1}, cuda_ctx); + framework::Tensor tmp_buffer_t = ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; @@ -684,7 +682,6 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu index be3f6d6c91a..6419e524f71 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu @@ -20,5 +20,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( pow2_decay_with_linear_warmup, - ops::Pow2DecayWithLinearWarmupOpKernel, - ops::Pow2DecayWithLinearWarmupOpKernel); + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index 591dead3b12..c338f4cc717 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. 
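Note: the lars_momentum hunk above also binds temporary-buffer allocation to the new context type; the element type is stripped in this copy, so MT below stands in for the kernel's master-precision type:

auto& cuda_ctx = ctx.template device_context<phi::GPUContext>();
int sm_num = cuda_ctx.GetSMCount();
// p/g norm scratch buffer; MT is an assumption for the stripped dtype.
framework::Tensor tmp_buffer_t =
    ctx.AllocateTmpTensor<MT, phi::GPUContext>({LARS_BLOCK_SIZE << 1}, cuda_ctx);
auto* p_buffer = tmp_buffer_t.mutable_data<MT>(ctx.GetPlace());
auto* g_buffer = p_buffer + LARS_BLOCK_SIZE;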
*/ #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - proximal_adagrad, - ops::ProximalAdagradOpKernel); +REGISTER_OP_CUDA_KERNEL(proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index d556fa74f19..edc911134c7 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -13,6 +13,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - proximal_gd, - ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL(proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 6fd49248db1..28ca7c6d8d3 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -65,8 +65,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -class SGDOpKernel - : public framework::OpKernel { +class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu index cbafefb34fd..d8f8e9749b8 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cu @@ -19,7 +19,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( sparse_momentum, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel); + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel); diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 25dae1ec7f3..5ed217b2e60 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -508,8 +508,8 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), + phi::funcs::SetConstant set_zero; + set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index e523c93f5d1..254e8ebe5c5 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -259,17 +259,14 @@ REGISTER_OP_CPU_KERNEL( ops::PadConstantLikeGradKernel, ops::PadConstantLikeGradKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL(pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); REGISTER_OP_CUDA_KERNEL( pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + 
ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index 7e365dbeb1d..f4d8f7083b0 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -101,7 +101,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { int all_length = batch_size * out_batch_len; constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; @@ -171,8 +171,8 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { auto grad_batch_len = partial_len * in_num; auto all_length = grad_batch_len * batch_size; // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index a8d0b145082..69517233bf3 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -94,7 +94,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; @@ -163,8 +163,8 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } // initialize - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); @@ -180,7 +180,7 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { auto out_num = outs.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); auto sm_count = max_threads / theory_sm_threads; diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index 04249d37794..ac4666bb174 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -426,7 +426,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(prroi_pool, ops::GPUPRROIPoolOpKernel, ops::GPUPRROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - prroi_pool_grad, - ops::GPUPRROIPoolGradOpKernel, - ops::GPUPRROIPoolGradOpKernel); +REGISTER_OP_CUDA_KERNEL(prroi_pool_grad, + ops::GPUPRROIPoolGradOpKernel, + ops::GPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu 
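Note: partial_concat_op.cu and partial_sum_op.cu show the Eigen variant of the same change: gradient buffers are zero-initialized through the context's eigen_device(). A sketch, with the zero-fill expression reconstructed from the surrounding code:

auto& place =
    *ctx.template device_context<phi::GPUContext>().eigen_device();
for (size_t i = 0; i < outs.size(); ++i) {
  outs[i]->mutable_data<T>(ctx.GetPlace());
  auto dxt = framework::EigenVector<T>::Flatten(*outs[i]);
  dxt.device(place) = dxt.constant(static_cast<T>(0));  // assumed zero-fill
}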
b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 85d57974ede..3b626cd762e 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -130,4 +130,4 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL( prune_gate_by_capacity, - ops::PruneGateByCapacityCUDAKernel); + ops::PruneGateByCapacityCUDAKernel); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc index c8342e6d5d1..6fe0156c01a 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cu.cc @@ -19,4 +19,4 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc index 5c4ae3bdcfe..bba442a630a 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cu.cc @@ -19,5 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( distributed_push_sparse, - ops::DistributedPushSparseKernel, - ops::DistributedPushSparseKernel); + ops::DistributedPushSparseKernel, + ops::DistributedPushSparseKernel); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 73eb3f15092..d3f1d17e7a3 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -98,12 +98,11 @@ class SendAndRecvOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(send_and_recv, ops::SendAndRecvOp, ops::SendAndRecvOpMaker); -REGISTER_OP_CUDA_KERNEL( - send_and_recv, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel); +REGISTER_OP_CUDA_KERNEL(send_and_recv, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel); REGISTER_OP_CPU_KERNEL(send_and_recv, ops::SendAndRecvKernel, ops::SendAndRecvKernel, diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 8d0d2d3090c..9aef7051fa5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -100,8 +100,7 @@ void InitTensorsOnClient(framework::Scope* scope, // ids_var->mutable_data(framework::DDim({rows_numel, 1}), // *place); // for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - auto stream = - reinterpret_cast(ctx).stream(); + auto stream = reinterpret_cast(ctx).stream(); auto micro_id_var = scope->Var("microbatch_id")->GetMutable(); @@ -245,7 +244,7 @@ TEST(SENDANDRECV, GPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 9255a5f164b..9c13934ccd4 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -223,21 +223,17 @@ 
REGISTER_OP_CPU_KERNEL( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( py_layer, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel>, - ops::PyLayerOpKernel>); + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel>, + ops::PyLayerOpKernel>); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 24ae989532d..8ae18a56329 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -36,8 +36,7 @@ class QrGPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { bool compute_q; bool reduced_mode; - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); const Tensor& x = *context.Input("X"); Tensor& q = *context.Output("Q"); Tensor& r = *context.Output("R"); @@ -69,8 +68,7 @@ class QrGPUKernel : public framework::OpKernel { size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = - math::DeviceIndependenceTensorOperations(context); + math::DeviceIndependenceTensorOperations(context); // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr @@ -94,7 +92,7 @@ class QrGPUKernel : public framework::OpKernel { auto qr_data = qr.mutable_data(context.GetPlace()); auto tau_data = tau.mutable_data(context.GetPlace()); - BatchedGeqrf( + BatchedGeqrf( dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, tau_stride); if (reduced_mode) { @@ -114,16 +112,16 @@ class QrGPUKernel : public framework::OpKernel { // Perform QRGQR for Q using the result from GEQRF // Transpose 'q' to retore the original row-major order if (reduced_mode) { - BatchedOrgqr(dev_ctx, - batch_size, - m, - min_mn, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + min_mn, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -142,29 +140,29 @@ class QrGPUKernel : public framework::OpKernel { qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - new_qr_data, - m, - tau_data, - new_qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + new_qr_data, + m, + tau_data, + new_qr_stride, + tau_stride); auto trans_q = dito.Transpose(new_qr); framework::TensorCopy(trans_q, q.place(), &q); } else { - BatchedOrgqr(dev_ctx, - batch_size, - m, - m, - min_mn, - qr_data, - m, - tau_data, - qr_stride, - tau_stride); + BatchedOrgqr(dev_ctx, + batch_size, + m, + m, + min_mn, + qr_data, + m, + tau_data, + qr_stride, + tau_stride); auto trans_q = dito.Transpose(qr); auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); framework::TensorCopy(sliced_q, q.place(), &q); @@ -175,16 +173,15 @@ class QrGPUKernel : public framework::OpKernel { }; template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - 
int n, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -227,16 +224,15 @@ void BatchedGeqrf( } template <> -void BatchedGeqrf( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -279,17 +275,16 @@ void BatchedGeqrf( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - float* a, - int lda, - float* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + float* a, + int lda, + float* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -333,17 +328,16 @@ void BatchedOrgqr( } template <> -void BatchedOrgqr( - const platform::CUDADeviceContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - double* a, - int lda, - double* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const phi::GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + double* a, + int lda, + double* tau, + int a_stride, + int tau_stride) { int lwork = 0; auto handle = dev_ctx.cusolver_dn_handle(); @@ -391,9 +385,8 @@ void BatchedOrgqr( namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); -REGISTER_OP_CUDA_KERNEL( - qr_grad, - ops::QrGradKernel, - ops::QrGradKernel); +REGISTER_OP_CUDA_KERNEL(qr_grad, + ops::QrGradKernel, + ops::QrGradKernel); #endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index 93c688aa642..37ca11db3e3 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -24,8 +24,8 @@ namespace paddle { namespace operators { template -struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CUDADeviceContext& dev_ctx, +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::GPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -61,14 +61,14 @@ struct ChannelDequantizeFunctorV2 { } }; -template struct ChannelDequantizeFunctorV2; -template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; +using CUDA = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(dequantize_linear, ops::DeQuantizeLinearKernel, ops::DeQuantizeLinearKernel, diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu index 55f7615d0f1..8a10b96a6f0 100644 --- a/paddle/fluid/operators/random_crop_op.cu +++ b/paddle/fluid/operators/random_crop_op.cu @@ -16,7 +16,7 @@ namespace ops = paddle::operators; template -using Kernel = ops::RandomCropKernel; +using Kernel = ops::RandomCropKernel; REGISTER_OP_CUDA_KERNEL(random_crop, Kernel, Kernel, diff --git 
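Note: qr_op.cu above also migrates explicit specializations of free helper functions; the template <> headers and specialization arguments are stripped in this copy. Reconstructed shape of the float case, following the visible parameter list:

// Explicit specialization for float on the unified GPU context.
template <>
void BatchedGeqrf<phi::GPUContext, float>(const phi::GPUContext& dev_ctx,
                                          int batch_size,
                                          int m,
                                          int n,
                                          float* a,
                                          int lda,
                                          float* tau,
                                          int a_stride,
                                          int tau_stride) {
  int lwork = 0;
  auto handle = dev_ctx.cusolver_dn_handle();
  // ... cuSOLVER workspace query and geqrf calls, unchanged by the patch ...
}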
a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index aee430b5057..253560d981d 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -39,7 +39,7 @@ struct Random { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct Random { +struct Random { using Engine = thrust::minstd_rand; template diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 61e38fb00fc..0b8aaf2d970 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -60,8 +60,7 @@ class RandomRoutingOpCUDAKernel : public framework::OpKernel { auto out = context.Output("Out"); auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); + const auto& dev_ctx = context.template device_context(); framework::TensorCopy(*topk_idx, place, out); size_t N = topk_idx->dims()[0]; diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 7571fcae270..83f6f23f985 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -62,7 +62,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); int max_ins = std::max(ins_num, max_size); @@ -83,8 +83,8 @@ class RankAttentionCUDAKernel : public framework::OpKernel { auto ins_rank_eigen = framework::EigenVector::Flatten(*ins_rank); auto out_eigen = framework::EigenVector::Flatten(*Out); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &place = + *ctx.template device_context().eigen_device(); param_help_eigen.device(place) = param_help_eigen.constant(static_cast(0)); @@ -135,7 +135,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel { int64_t strideA = block_matrix_row; int64_t strideB = block_matrix_row * para_col; - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); blas.BatchedGEMM(transA, transB, 1, @@ -176,9 +176,9 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { auto rank_offset_dims = rank_offset->dims(); auto max_rank = (rank_offset_dims[1] - 1) / 2; int block_matrix_row = max_rank * x_fea_dim; - auto &dev_ctx = ctx.template device_context(); - auto &place = *ctx.template device_context() - .eigen_device(); + auto &dev_ctx = ctx.template device_context(); + auto &place = + *ctx.template device_context().eigen_device(); int max_ins = std::max(ins_num, max_size); // initialize out grad @@ -201,7 +201,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { const T *ins_rank_data = ins_rank->data(); T *param_grad_data = param_grad.data(); - auto blas = phi::funcs::GetBlas(dev_ctx); + auto blas = phi::funcs::GetBlas(dev_ctx); T alpha = 1; T beta = 0; @@ -242,7 +242,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using GPUCtx = paddle::platform::CUDADeviceContext; +using GPUCtx = phi::GPUContext; REGISTER_OP_CUDA_KERNEL(rank_attention, ops::RankAttentionCUDAKernel, ops::RankAttentionCUDAKernel); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index edf82d00950..b353b2992ce 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -245,10 
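Note: files with many registrations keep a short alias so each macro entry stays on one line; rank_attention_op.cu and quantize_linear_op.cu above retarget that alias, and inside the kernels phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx) replaces the old instantiation. Sketch of the alias form (dtype list assumed):

// The alias now names the unified context.
using GPUCtx = phi::GPUContext;  // was paddle::platform::CUDADeviceContext
REGISTER_OP_CUDA_KERNEL(rank_attention,
                        ops::RankAttentionCUDAKernel<GPUCtx, float>,
                        ops::RankAttentionCUDAKernel<GPUCtx, double>);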
+245,7 @@ REGISTER_OP_CPU_KERNEL(rank_loss_grad, ops::RankLossGradKernel); REGISTER_OP_CUDA_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); + rank_loss, paddle::operators::RankLossKernel); REGISTER_OP_CUDA_KERNEL( rank_loss_grad, - paddle::operators::RankLossGradKernel); + paddle::operators::RankLossGradKernel); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index a36d51e42f5..b9c608b62e7 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -52,8 +52,8 @@ BufferedReader::BufferedReader( if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; compute_stream_ = - ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance() - .Get(place_))) + ((phi::GPUContext *)(platform::DeviceContextPool::Instance().Get( + place_))) ->stream(); events_.resize(buffer_size); for (auto &event : events_) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 95bb0610771..d7f153700cf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -31,7 +31,7 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, +void TensorReduceImpl(const phi::GPUContext& dev_ctx, const framework::Tensor& x, framework::Tensor* y, const TransformOp& transform, diff --git a/paddle/fluid/operators/renorm_op.cu b/paddle/fluid/operators/renorm_op.cu index e022e128c7f..ea21b985e7f 100644 --- a/paddle/fluid/operators/renorm_op.cu +++ b/paddle/fluid/operators/renorm_op.cu @@ -161,8 +161,7 @@ class CUDARenormKernel : public framework::OpKernel { std::vector ins = {x}; std::vector outs = {&pow_value}; auto func = UnsignedPowFunctor(p); - const auto& cuda_ctx = - context.template device_context(); + const auto& cuda_ctx = context.template device_context(); paddle::operators::LaunchSameDimsElementwiseCudaKernel( cuda_ctx, ins, &outs, func); diff --git a/paddle/fluid/operators/repeat_interleave_op.cu b/paddle/fluid/operators/repeat_interleave_op.cu index 3371134f344..07099c30271 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cu +++ b/paddle/fluid/operators/repeat_interleave_op.cu @@ -88,8 +88,7 @@ class RepeatInterleaveCUDAKernel : public framework::OpKernel { auto stride_dim = phi::stride(input_dim); int64_t stride = stride_dim[dim]; - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); int repeats = context.Attr("Repeats"); framework::LoDTensor index; @@ -218,8 +217,7 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { int64_t numel = in_grad->numel(); int64_t out_nums = output_grad->numel(); - auto stream = - context.template device_context().stream(); + auto stream = context.template device_context().stream(); index_select_grad_init <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, @@ -328,23 +326,16 @@ class RepeatInterleaveGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( repeat_interleave, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel, - ops::RepeatInterleaveCUDAKernel); + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + ops::RepeatInterleaveCUDAKernel, + 
ops::RepeatInterleaveCUDAKernel); REGISTER_OP_CUDA_KERNEL( repeat_interleave_grad, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel, - ops::RepeatInterleaveGradCUDAKernel); + ops::RepeatInterleaveGradCUDAKernel, + ops::RepeatInterleaveGradCUDAKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b665cce0962..6a25e2c7902 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -428,7 +428,7 @@ class ReshapeKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeKernel(static_cast(dev_ctx), *in, pt_scalar_shape, @@ -461,7 +461,7 @@ class ReshapeGradKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } @@ -491,7 +491,7 @@ class ReshapeDoubleGradKernel { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 633811862d8..f69889f7f8f 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -323,8 +323,7 @@ __global__ void RowConvGradFilter(const T *in, } // namespace template -class RowConvKernel - : public framework::OpKernel { +class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -378,8 +377,7 @@ class RowConvKernel }; template -class RowConvGradKernel - : public framework::OpKernel { +class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -418,7 +416,7 @@ class RowConvGradKernel size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -494,8 +492,6 @@ class RowConvGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - row_conv, ops::RowConvKernel); -REGISTER_OP_CUDA_KERNEL( - row_conv_grad, - ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL(row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL(row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cu.cc b/paddle/fluid/operators/run_program_op.cu.cc index 19cd354c18f..b3383434203 100644 --- a/paddle/fluid/operators/run_program_op.cu.cc +++ b/paddle/fluid/operators/run_program_op.cu.cc @@ -20,9 +20,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; /* see [Why use single type kernel] */ -REGISTER_OP_CUDA_KERNEL( - run_program, - ops::RunProgramOpKernel); -REGISTER_OP_CUDA_KERNEL( - run_program_grad, - ops::RunProgramGradOpKernel); 
+REGISTER_OP_CUDA_KERNEL(run_program, + ops::RunProgramOpKernel); +REGISTER_OP_CUDA_KERNEL(run_program_grad, + ops::RunProgramGradOpKernel); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 1fb9942b37a..d0d8af95a3f 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -145,7 +145,7 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, sampled_logits, static_cast(0)); auto sampled_labels_data = @@ -244,7 +244,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu index 71476fd802b..e96aafa3829 100644 --- a/paddle/fluid/operators/save_combine_op.cu +++ b/paddle/fluid/operators/save_combine_op.cu @@ -16,9 +16,8 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); +REGISTER_OP_CUDA_KERNEL(save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu index 056894dbae1..03753b6e7e3 100644 --- a/paddle/fluid/operators/save_op.cu +++ b/paddle/fluid/operators/save_op.cu @@ -19,11 +19,10 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 9b1d7a27e58..be406db5056 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -53,6 +53,5 @@ class GPUSeedKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - seed, - paddle::operators::GPUSeedKernel); +REGISTER_OP_CUDA_KERNEL(seed, + paddle::operators::GPUSeedKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc index f2117a2f098..2374ec02e8f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc @@ -18,21 +18,13 @@ REGISTER_OP_CUDA_KERNEL( sequence_concat, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel, - paddle::operators::SeqConcatKernel); + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel, + paddle::operators::SeqConcatKernel); REGISTER_OP_CUDA_KERNEL( sequence_concat_grad, - paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel, - 
paddle::operators::SeqConcatGradKernel, - paddle::operators::SeqConcatGradKernel); + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel, + paddle::operators::SeqConcatGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc index 600981b5e96..5939ede964c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_conv, - ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL(sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index 363c40ce26d..cacd777f17e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -66,9 +66,9 @@ static __global__ void sequence_expand_as_grad_kernel( } template -struct SequenceExpandAsFunctor { +struct SequenceExpandAsFunctor { void operator()( - const platform::CUDADeviceContext &context, + const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*expand referenced lod*/ LoDTensor *out) { @@ -97,8 +97,8 @@ struct SequenceExpandAsFunctor { }; template -struct SequenceExpandAsGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceExpandAsGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const framework::Vector &ref_lod, /*expand based lod*/ LoDTensor *dx) { @@ -133,17 +133,14 @@ struct SequenceExpandAsGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand_as, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_as_grad, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel); + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 5ba02527825..f6e082f4d2a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -96,7 +96,7 @@ void GetOutputOffset(const framework::Vector& x_lod, } template -static int ExpandByMemoryCopy(const platform::CUDADeviceContext& context, +static int ExpandByMemoryCopy(const phi::GPUContext& context, const LoDTensor& x, LoDTensor* out, const framework::Vector& x_lod, @@ -142,9 +142,9 @@ static int ExpandByMemoryCopy(const 
platform::CUDADeviceContext& context, } template -struct SequenceExpandFunctor { +struct SequenceExpandFunctor { void operator()( - const platform::CUDADeviceContext& context, + const phi::GPUContext& context, const LoDTensor& x, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -194,8 +194,8 @@ struct SequenceExpandFunctor { }; template -struct SequenceExpandGradFunctor { - void operator()(const platform::CUDADeviceContext& context, +struct SequenceExpandGradFunctor { + void operator()(const phi::GPUContext& context, const LoDTensor& dout, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand based lod*/ @@ -228,16 +228,14 @@ struct SequenceExpandGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_expand, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel); +REGISTER_OP_CUDA_KERNEL(sequence_expand, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel); REGISTER_OP_CUDA_KERNEL( sequence_expand_grad, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel); + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu index e963ce610e2..b4284d2717a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cu @@ -16,11 +16,7 @@ REGISTER_OP_CUDA_KERNEL( sequence_mask, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel); + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu index 7fc64a530ef..84a3e8da141 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cu @@ -15,15 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pad, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pad_grad, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu index 4897474a485..882ec66f501 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_pool, - ops::SequencePoolKernel); -REGISTER_OP_CUDA_KERNEL( - sequence_pool_grad, - ops::SequencePoolGradKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL(sequence_pool_grad, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu index 38bc599165d..eaf34643a07 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_reshape_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reshape, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); REGISTER_OP_CUDA_KERNEL( sequence_reshape_grad, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel); + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu index 0a59ed7f9fe..810130669b5 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu @@ -16,10 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_reverse, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu index a4b0ea2e5b2..ecf39a07309 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_slice, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_slice, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 58e99364f4f..b060aa9f08b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -61,10 +61,8 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { phi::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxCUDNNFunctor()( - ctx.template device_context(), - &x_i, - &out_i); + math::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; @@ -97,8 +95,8 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradCUDNNFunctor()( - ctx.template device_context(), + math::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, &x_grad_i); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index bb0ad26b51b..5417c20f3d4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -40,8 +40,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; } #endif @@ -149,8 +148,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel { bool runtime_cudnn_support = false; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? 
true : false; } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 696f6e7ca88..360f9055519 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -121,8 +121,8 @@ __global__ void sequence_softmax_grad_kernel(const T *softmax_grad_data, } template -struct SequenceSoftmaxFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &x, const framework::Vector &ref_lod, /*referenced lod*/ LoDTensor *out) { @@ -146,8 +146,8 @@ struct SequenceSoftmaxFunctor { }; template -struct SequenceSoftmaxGradFunctor { - void operator()(const platform::CUDADeviceContext &context, +struct SequenceSoftmaxGradFunctor { + void operator()(const phi::GPUContext &context, const LoDTensor &dout, const LoDTensor &out, const framework::Vector &ref_lod, /*referenced lod*/ @@ -177,12 +177,10 @@ struct SequenceSoftmaxGradFunctor { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL(sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu index bf54f77f5b5..4124e17cb09 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu @@ -15,16 +15,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - sequence_unpad, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); REGISTER_OP_CUDA_KERNEL( sequence_unpad_grad, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel); + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu index 7803f407181..6b70b8d37d7 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cu +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -88,7 +88,7 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *shuffleidx_data = shuffleidx->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); #ifdef PADDLE_WITH_CUDA const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); #else @@ -106,8 +106,8 @@ class ShuffleBatchCUDAKernel : public framework::OpKernel { auto *out_data = out->mutable_data(ctx.GetPlace()); ReorderFunctor functor( x_data, shuffleidx_data, out_data, x_embed_size); - platform::ForRange for_range( - dev_ctx, elem_size * x_embed_size); + platform::ForRange for_range(dev_ctx, + elem_size * x_embed_size); for_range(functor); auto *seed_out_data = seed_out->mutable_data(phi::make_ddim({1}), @@ -136,10 +136,9 @@ class ShuffleBatchGradCUDAKernel : public framework::OpKernel { auto x_embed_size = x_grad->dims()[x_grad->dims().size() - 1]; ReorderFunctor functor( out_grad_data, shuffleidx_data, x_grad_data, x_embed_size); - auto &dev_ctx = ctx.template device_context(); + auto &dev_ctx = ctx.template device_context(); // TODO(zengjinle): for small data, direct cudaMemcpy may be better - platform::ForRange for_range(dev_ctx, - x_grad->numel()); + platform::ForRange for_range(dev_ctx, x_grad->numel()); for_range(functor); #endif } diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 02c0cfdd969..f51724d8431 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -129,12 +129,9 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( shuffle_channel, - ops::ShuffleChannelOpCUDAKernel, - ops::ShuffleChannelOpCUDAKernel); + ops::ShuffleChannelOpCUDAKernel, + ops::ShuffleChannelOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( shuffle_channel_grad, - ops::ShuffleChannelGradOpCUDAKernel, - ops::ShuffleChannelGradOpCUDAKernel); + ops::ShuffleChannelGradOpCUDAKernel, + ops::ShuffleChannelGradOpCUDAKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4e812261883..f42ebbe0399 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -488,32 +488,24 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CUDA_KERNEL( slice, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel>, - ops::SliceKernel>); + ops::SliceKernel, + 
ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); REGISTER_OP_CUDA_KERNEL( slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel>, - ops::SliceGradKernel>); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cu b/paddle/fluid/operators/smooth_l1_loss_op.cu index e5df479090f..d57b96d0ec5 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cu +++ b/paddle/fluid/operators/smooth_l1_loss_op.cu @@ -14,9 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossKernel); -REGISTER_OP_CUDA_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL(smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu index 5a9f7c288d1..f9df5a5f74b 100644 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ b/paddle/fluid/operators/space_to_depth_op.cu @@ -17,16 +17,14 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - space_to_depth, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); -REGISTER_OP_CUDA_KERNEL( - space_to_depth_grad, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel); +REGISTER_OP_CUDA_KERNEL(space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu index 423ec727108..8bf431e59f0 100644 --- a/paddle/fluid/operators/sparse_attention_op.cu +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -209,7 +209,7 @@ input: sparse C in CSR format (num_rows,num_rows) output: sparse C after softmax operation */ template -void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxForward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* input, @@ -322,7 +322,7 @@ void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, } template -void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, +void SparseSoftmaxBackward(const phi::GPUContext& ctx, const Tensor* offset, const Tensor* columns, Tensor* dx, @@ -453,7 +453,7 @@ input: dense A (num_rows,num_cols), dense B (num_rows,num_cols) output: sparse C in CSR format (num_rows,num_rows) */ template -void DotSdd(const platform::CUDADeviceContext& ctx, +void DotSdd(const phi::GPUContext& ctx, const Tensor* a, const Tensor* b, const Tensor* c_offset, @@ -546,7 +546,7 @@ input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) output: dense C 
(num_rows,num_cols) */ template -void DotDsd(const platform::CUDADeviceContext& ctx, +void DotDsd(const phi::GPUContext& ctx, const Tensor* a_offset, const Tensor* a_columns, const Tensor* a_value, @@ -881,10 +881,10 @@ class SparseAttentionGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( sparse_attention, - ops::SparseAttentionCUDAKernel, - ops::SparseAttentionCUDAKernel); + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); REGISTER_OP_CUDA_KERNEL( sparse_attention_grad, - ops::SparseAttentionGradCUDAKernel, - ops::SparseAttentionGradCUDAKernel); + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index f1134726998..661fcc83771 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -13,32 +13,26 @@ #include "paddle/fluid/operators/spectral_op.cu.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c, + ops::FFTC2CKernel, + ops::FFTC2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r, + ops::FFTC2RKernel, + ops::FFTC2RKernel); -REGISTER_OP_CUDA_KERNEL( - fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c, + ops::FFTR2CKernel, + ops::FFTR2CKernel); -REGISTER_OP_CUDA_KERNEL( - fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); +REGISTER_OP_CUDA_KERNEL(fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cu.h b/paddle/fluid/operators/spectral_op.cu.h index d7911d8ef18..5ced67691ee 100644 --- a/paddle/fluid/operators/spectral_op.cu.h +++ b/paddle/fluid/operators/spectral_op.cu.h @@ -907,8 +907,8 @@ static bool use_optimized_fft_path(const std::vector& axes) { } template -struct FFTC2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -934,7 +934,7 @@ struct FFTC2CFunctor { std::min(static_cast(kMaxFFTNdim), working_axes.size()); first_dims.assign(working_axes.end() - max_dims, working_axes.end()); - exec_fft( + exec_fft( ctx, p_working_tensor, p_out, first_dims, forward); working_axes.resize(working_axes.size() - max_dims); first_dims.clear(); @@ -945,14 +945,14 @@ struct FFTC2CFunctor { std::swap(p_out, p_working_tensor); } - exec_normalization( + exec_normalization( ctx, p_out, out, normalization, out_dims, axes); } }; template -struct FFTC2RFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -965,28 +965,27 @@ struct FFTC2RFunctor { framework::Tensor x_copy(X->type()); x_copy.mutable_data(X->dims(), ctx.GetPlace()); framework::TensorCopy(*X, ctx.GetPlace(), &x_copy); - exec_fft( - ctx, &x_copy, out, axes, forward); + exec_fft(ctx, &x_copy, out, axes, forward); } 
else { framework::Tensor temp_tensor; temp_tensor.mutable_data(X->dims(), ctx.GetPlace()); const std::vector dims(axes.begin(), axes.end() - 1); - FFTC2CFunctor c2c_functor; + FFTC2CFunctor c2c_functor; c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward); - exec_fft( + exec_fft( ctx, &temp_tensor, out, {axes.back()}, forward); } - exec_normalization( + exec_normalization( ctx, out, out, normalization, out_dims, axes); } }; // n dimension real to complex FFT use cufft lib template -struct FFTR2CFunctor { - void operator()(const platform::CUDADeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::GPUContext& ctx, const Tensor* X, Tensor* out, const std::vector& axes, @@ -996,22 +995,21 @@ struct FFTR2CFunctor { framework::Tensor* r2c_out = out; const std::vector last_dim{axes.back()}; std::vector out_dims = phi::vectorize(out->dims()); - exec_fft( - ctx, X, r2c_out, last_dim, forward); + exec_fft(ctx, X, r2c_out, last_dim, forward); // Step2: C2C transform on the remaining dimension framework::Tensor c2c_out; if (axes.size() > 1) { c2c_out.mutable_data(out->dims(), ctx.GetPlace()); std::vector remain_dim(axes.begin(), axes.end() - 1); - FFTC2CFunctor fft_c2c_func; + FFTC2CFunctor fft_c2c_func; fft_c2c_func( ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none, forward); } const auto in_sizes = phi::vectorize(X->dims()); framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out; - exec_normalization( + exec_normalization( ctx, norm_tensor, out, normalization, in_sizes, axes); } }; diff --git a/paddle/fluid/operators/spp_op.cu.cc b/paddle/fluid/operators/spp_op.cu.cc index f18efe4a035..24f4d65f661 100644 --- a/paddle/fluid/operators/spp_op.cu.cc +++ b/paddle/fluid/operators/spp_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/spp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - spp, - ops::SppKernel, - ops::SppKernel); -REGISTER_OP_CUDA_KERNEL( - spp_grad, - ops::SppGradKernel, - ops::SppGradKernel); +REGISTER_OP_CUDA_KERNEL(spp, + ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CUDA_KERNEL(spp_grad, + ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu index 9cef47bd07e..c10cbfb42f1 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ b/paddle/fluid/operators/squared_l2_distance_op.cu @@ -14,10 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - squared_l2_distance, - ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance, + ops::SquaredL2DistanceKernel); REGISTER_OP_CUDA_KERNEL( squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc index c7a96d03173..a77b369c403 100644 --- a/paddle/fluid/operators/squeeze_op.cu.cc +++ b/paddle/fluid/operators/squeeze_op.cu.cc @@ -19,31 +19,27 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>); + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>); REGISTER_OP_CUDA_KERNEL( squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>); diff --git a/paddle/fluid/operators/stft_op.cu b/paddle/fluid/operators/stft_op.cu index 7bc3396064c..9edee0f66c5 100644 --- a/paddle/fluid/operators/stft_op.cu +++ b/paddle/fluid/operators/stft_op.cu @@ -17,12 +17,10 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - stft, - ops::StftKernel, - ops::StftKernel); +REGISTER_OP_CUDA_KERNEL(stft, + ops::StftKernel, + ops::StftKernel); -REGISTER_OP_CUDA_KERNEL( - stft_grad, - ops::StftGradKernel, - ops::StftGradKernel); +REGISTER_OP_CUDA_KERNEL(stft_grad, + ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index d65fc9ea808..350c3820a38 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -110,8 +110,7 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; - auto& cuda_ctx = - reinterpret_cast(ctx); + auto& cuda_ctx = reinterpret_cast(ctx); memory::Copy(gpu_place, dst + i * dst_after, gpu_place, diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index e16df345427..3d8902a68ac 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -86,7 +86,7 @@ TEST(StridedMemcpy, GPUCrop) { platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -128,7 +128,7 @@ TEST(StridedMemcpy, GPUConcat) { platform::CUDAPlace gpu0(0); 
platform::CPUPlace cpu; - platform::CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 7b307413cd3..2cc17de1820 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -77,8 +77,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { const size_t in_num = in_vars.size(); constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto stream = dev_ctx.stream(); auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); @@ -138,11 +137,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { int start = in_place ? 1 : 0; if (!in_place) { - phi::funcs::SetConstant constant_functor; - constant_functor( - context.template device_context(), - out, - static_cast(0)); + phi::funcs::SetConstant constant_functor; + constant_functor(context.template device_context(), + out, + static_cast(0)); } std::vector in_data; @@ -243,8 +241,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } template -class SumKernel - : public framework::OpKernel { +class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto out_var = context.OutputVar("Out"); @@ -252,9 +249,9 @@ class SumKernel if (out_var->IsType()) { SumToLoDTensor(context); } else if (out_var->IsType()) { - SelectedRowsCompute(context); + SelectedRowsCompute(context); } else if (out_var->IsType()) { - LodTensorArrayCompute(context); + LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Expected type of Output(out) must be Tensor, SelectedRows or " @@ -269,11 +266,10 @@ class SumKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL(sum, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/tensor_to_string.h b/paddle/fluid/operators/tensor_to_string.h index f531df936cd..ef8a041fc5a 100644 --- a/paddle/fluid/operators/tensor_to_string.h +++ b/paddle/fluid/operators/tensor_to_string.h @@ -38,7 +38,7 @@ static std::vector ToVector(const T *x, using CopyT = typename std:: conditional::value, uint8_t, T>::type; std::vector cpu_x(n); - auto *dev_ctx = static_cast( + auto *dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); memory::Copy(platform::CPUPlace(), cpu_x.data(), diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 1cd2683796a..b13996b6fab 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -471,8 +471,7 @@ class TensorRTEngineOp : public framework::OperatorBase { int runtime_batch = -1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); + auto stream = reinterpret_cast(dev_ctx).stream(); std::vector output_maps = Attr>("output_name_mapping"); diff --git 
a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 8e2b162babc..33ebaff8eab 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -40,7 +40,7 @@ void CreateCUDATensor(framework::Scope* scope, auto dims = phi::make_ddim(shape); tensor->Resize(dims); platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -142,7 +142,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); @@ -171,7 +171,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + phi::GPUContext ctx(place); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, ctx.stream()) .get()); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index 6df883e8333..1162bf21592 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -96,10 +96,9 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, #if defined(__NVCC__) || defined(__HIPCC__) if (platform::is_gpu_place(place)) { - auto &cuda_dev_ctx = dynamic_cast(dev_ctx); + auto &cuda_dev_ctx = dynamic_cast(dev_ctx); functor(cuda_dev_ctx, &x, out, &ddx, &ddout, dout, dx); - platform::ForRange for_range(cuda_dev_ctx, - limit); + platform::ForRange for_range(cuda_dev_ctx, limit); for_range(actual_functor); } else { #endif diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index 390ed2b2ff3..4a038c93a1f 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -902,7 +902,7 @@ __global__ void AssignGradWithAxis(const T* grad_out, } // use the radix sort for the topk template -bool SortTopk(const platform::CUDADeviceContext& ctx, +bool SortTopk(const phi::GPUContext& ctx, const framework::Tensor* input_tensor, const int64_t num_cols, const int64_t num_rows, diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 4910d1cf259..79236f590f7 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -157,26 +157,18 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( top_k, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, - paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel, + paddle::operators::TopkOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( top_k_grad, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - paddle::operators::TopkOpGradCUDAKernel, - 
paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel, + paddle::operators::TopkOpGradCUDAKernel); diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu index 17d52cea1e0..1e4ca7bb838 100644 --- a/paddle/fluid/operators/tree_conv_op.cu +++ b/paddle/fluid/operators/tree_conv_op.cu @@ -15,11 +15,9 @@ #include "paddle/fluid/operators/tree_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - tree_conv, - ops::TreeConvKernel, - ops::TreeConvKernel); -REGISTER_OP_CUDA_KERNEL( - tree_conv_grad, - ops::TreeConvGradKernel, - ops::TreeConvGradKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv, + ops::TreeConvKernel, + ops::TreeConvKernel); +REGISTER_OP_CUDA_KERNEL(tree_conv_grad, + ops::TreeConvGradKernel, + ops::TreeConvGradKernel); diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index febb093ed70..a3490937410 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -32,12 +32,11 @@ class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); auto dims = vectorize(dx->dims()); - const auto& dev_cxt = - ctx.template device_context(); + const auto& dev_cxt = ctx.template device_context(); float value = static_cast(0.0f); phi::FullKernel( static_cast::TYPE&>(dev_cxt), + phi::GPUContext>::TYPE&>(dev_cxt), dims, value, phi::DataType::UNDEFINED, diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 0614e0920df..9f0f93f5573 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -150,8 +150,7 @@ template void UniformRandom(const framework::ExecutionContext& context, framework::Tensor* tensor) { int64_t size = tensor->numel(); - auto& dev_cxt = - context.template device_context(); + auto& dev_cxt = context.template device_context(); T* data = tensor->mutable_data(dev_cxt.GetPlace()); if (size <= 0) return; unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/unpool_op.cu.cc b/paddle/fluid/operators/unpool_op.cu.cc index 71c05658033..82890419daf 100644 --- a/paddle/fluid/operators/unpool_op.cu.cc +++ b/paddle/fluid/operators/unpool_op.cu.cc @@ -15,19 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CUDA_KERNEL( - unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CUDA_KERNEL( - unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d, + ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CUDA_KERNEL(unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc index 598595ff28b..3a98a64d858 100644 --- a/paddle/fluid/operators/unsqueeze_op.cu.cc +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -19,35 +19,30 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>); REGISTER_OP_CUDA_KERNEL( unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>); diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu index cd6c3a22e03..3ec89214a38 100644 --- a/paddle/fluid/platform/bfloat16_test.cu +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -67,7 +67,7 @@ TEST(bfloat16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); - CUDADeviceContext gpu_ctx(gpu_place); + phi::GPUContext gpu_ctx(gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 00b5dd7f8af..2589aa9acd0 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -41,10 +41,10 @@ class NCCLCommImpl : public NCCLComm { gpuStream_t stream() const override { return dev_ctx_->stream(); } - void set_dev_ctx(std::unique_ptr&& dev_ctx) { + void set_dev_ctx(std::unique_ptr&& dev_ctx) { dev_ctx_ = std::move(dev_ctx); } - CUDADeviceContext* dev_context() const override { return dev_ctx_.get(); } + phi::GPUContext* dev_context() const override { return 
dev_ctx_.get(); } gpuEvent_t compute_event() const override { return compute_event_.get(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int nranks_; int rank_; ncclComm_t comm_; - std::unique_ptr dev_ctx_; + std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream std::shared_ptr compute_event_; @@ -203,8 +203,8 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( NCCLComm* NCCLCommContext::AssignNCCLComm( ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { - std::unique_ptr dev_ctx( - new CUDADeviceContext(CUDAPlace(dev_id))); + std::unique_ptr dev_ctx( + new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(CUDAPlace(dev_id), dev_ctx->stream()) .get()); @@ -246,7 +246,7 @@ NCCLComm* NCCLCommContext::AssignNCCLComm( comm_map_mutex_.unlock(); if (ring_id == 0) { - auto* dev_ctx = static_cast( + auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(dev_id))); dev_ctx->set_nccl_comm(comm); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 37065960828..207496d9f46 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -62,7 +62,7 @@ class NCCLComm { virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; - virtual CUDADeviceContext* dev_context() const = 0; + virtual phi::GPUContext* dev_context() const = 0; virtual ~NCCLComm() = default; }; diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index bfdf492962d..9f049b6e248 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -27,8 +27,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode, int64_t pool_id) { auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); // After PR(#43206), cudnn related initializations will change to lazy mode. 
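The hunks above converge on a single lookup idiom: code that used to ask for platform::CUDADeviceContext now asks for phi::GPUContext, either through the execution context inside a kernel or by casting the pooled DeviceContext*. A minimal sketch of both forms follows, assuming a CUDA build; the free-function names are illustrative and not part of the patch.

// Sketch only: the two retrieval forms this patch standardizes on.
// `GetPooledGpuContext` and `IllustrativeCompute` are illustrative names.
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"

phi::GPUContext* GetPooledGpuContext(const paddle::platform::CUDAPlace& place) {
  // The pool hands back a DeviceContext*; on a GPU place it is backed by
  // phi::GPUContext, so the cast mirrors what the diff does.
  return static_cast<phi::GPUContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(place));
}

template <typename T>
void IllustrativeCompute(const paddle::framework::ExecutionContext& ctx) {
  // Inside an OpKernel, request phi::GPUContext directly and take its stream.
  auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
  auto stream = dev_ctx.stream();
  (void)stream;
}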
@@ -66,8 +65,7 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, std::unique_ptr EndCUDAGraphCapture() { auto place = CUDAGraph::CapturingPlace(); auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); - auto* dev_ctx = - reinterpret_cast(mutable_dev_ctx); + auto* dev_ctx = reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); dev_ctx->SetCUDAGraphAllocator(nullptr); return CUDAGraph::EndCapture(); diff --git a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h index cc76a04a769..427901c1a7f 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cudnn_helper.h @@ -621,7 +621,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 507baf6c0f4..3628b7e0418 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -65,8 +65,8 @@ static inline int RoundToPowerOfTwo(int n) { #ifdef WITH_NV_JETSON // The number of threads cannot be assigned 1024 in some cases when the device // is nano or tx2 . -template -inline void ChangeThreadNum(const CUDADeviceContext& context, +template +inline void ChangeThreadNum(const phi::GPUContext& context, int* num_thread, int alternative_num_thread = 512) { if (context.GetComputeCapability() == 53 || @@ -99,10 +99,9 @@ struct GpuLaunchConfig { * cuda performs better. And number of blocks should be greater (at least * 2x~4x) than number of SMs. Hence, SM count is took into account within * this function to determine the right number of threads per block. */ -inline GpuLaunchConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, - int64_t numel, - int vec_size = 1) { +inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, + int64_t numel, + int vec_size = 1) { PADDLE_ENFORCE_GE(numel, 0, platform::errors::InvalidArgument( @@ -146,8 +145,9 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( return config; } -inline GpuLaunchConfig GetGpuLaunchConfig2D( - const platform::CUDADeviceContext& context, int x_dim, int y_dim) { +inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, + int x_dim, + int y_dim) { PADDLE_ENFORCE_GT( x_dim, 0, @@ -182,8 +182,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D( template void LimitGridDim(const Context& ctx, dim3* grid_dim) { - auto max_grid_dim = reinterpret_cast(ctx) - .GetCUDAMaxGridDimSize(); + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; grid_dim->z = grid_dim->z < max_grid_dim[2] ? 
grid_dim->z : max_grid_dim[2]; diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 1ce8038f0e3..a5d89f6001f 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -120,11 +120,11 @@ class NCCLGroupGuard { }; struct NCCLContext { - std::unique_ptr ctx_; + std::unique_ptr ctx_; ncclComm_t comm_; explicit NCCLContext(int dev_id) : comm_{nullptr} { - ctx_.reset(new CUDADeviceContext(CUDAPlace(dev_id))); + ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id))); ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(CUDAPlace(dev_id), ctx_->stream()) .get()); @@ -211,11 +211,9 @@ struct NCCLContextMap { NCCLContextMap(const NCCLContextMap &other) = delete; NCCLContextMap &operator=(const NCCLContextMap &other) = delete; - CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + phi::GPUContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } - CUDADeviceContext *DevCtx(platform::Place p) const { - return DevCtx(p.device); - } + phi::GPUContext *DevCtx(platform::Place p) const { return DevCtx(p.device); } const NCCLContext &at(platform::Place p) const { return this->at(p.device); } diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index ff7f64ef1be..9cb5cdfbb16 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -558,7 +558,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_HIP if (use_cudnn) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; } #endif diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index bd8d3f8a372..f91b420be0d 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -252,7 +252,7 @@ bool CUDADeviceCode::Compile(bool include_path) { } // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; @@ -329,7 +329,7 @@ bool CUDADeviceCode::Compile(bool include_path) { } // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); std::string compute_flag = @@ -416,7 +416,7 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { max_blocks, (static_cast(n) + workload_per_block - 1) / workload_per_block); - auto* dev_ctx = reinterpret_cast( + auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 113862c6ec2..d38118d2a26 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -50,17 +50,16 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { if (platform::is_gpu_place(place)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* 
default_dev_ctx = static_cast( + auto* default_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = - static_cast(dev_ctx); + auto& desired_dev_ctx = static_cast(dev_ctx); if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size, phi::Stream(reinterpret_cast( desired_dev_ctx.stream()))); } else { - return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( + return allocation::GPUContextAllocatorPool::Instance().Alloc( desired_dev_ctx, size); } #else @@ -191,11 +190,11 @@ std::unique_ptr CreateDeviceContext( auto* dev_ctx = new DevCtx(p); if (is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* cuda_ctx = dynamic_cast(dev_ctx); + auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, platform::errors::InvalidArgument( - "Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); + "Failed to dynamic_cast dev_ctx into phi::GPUContext.")); auto& instance = memory::allocation::AllocatorFacade::Instance(); if (!disable_setting_default_stream_for_allocator) { @@ -271,7 +270,7 @@ void EmplaceDeviceContexts( #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - EmplaceDeviceContext( + EmplaceDeviceContext( place_to_device_context, p, disable_setting_default_stream_for_allocator); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 70b979aa9bb..6d08a0cc32b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -271,11 +271,9 @@ struct DefaultDeviceContextType { class CudnnWorkspaceHandle; class EigenCudaStreamDevice; -using CUDADeviceContext = phi::GPUContext; - class CudnnWorkspaceHandle { public: - inline CudnnWorkspaceHandle(const CUDADeviceContext& dev_ctx, std::mutex* mtx) + inline CudnnWorkspaceHandle(const phi::GPUContext& dev_ctx, std::mutex* mtx) : device_context_(dev_ctx), mtx_(mtx) {} template @@ -318,13 +316,13 @@ class CudnnWorkspaceHandle { private: memory::allocation::AllocationPtr allocation_; - const CUDADeviceContext& device_context_; + const phi::GPUContext& device_context_; std::mutex* mtx_; }; template <> struct DefaultDeviceContextType { - using TYPE = CUDADeviceContext; + using TYPE = phi::GPUContext; }; // Currently, CUDAPinnedDeviceContext is only used to data copying. diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 2db29dc11ad..abffa1e8846 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -19,13 +19,13 @@ limitations under the License. 
 */
 #include "paddle/fluid/platform/device_context.h"
 TEST(Device, Init) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
+  using phi::GPUContext;
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -50,13 +50,13 @@ TEST(Device, Init) {
   }
 }
-TEST(Device, CUDADeviceContext) {
-  using paddle::platform::CUDADeviceContext;
+TEST(Device, GPUContext) {
   using paddle::platform::CUDAPlace;
+  using phi::GPUContext;
   int count = paddle::platform::GetGPUDeviceCount();
   for (int i = 0; i < count; i++) {
-    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    phi::GPUContext* device_context = new phi::GPUContext(CUDAPlace(i));
     device_context->SetAllocator(
         paddle::memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(CUDAPlace(i), device_context->stream())
@@ -94,10 +94,10 @@ TEST(Device, CUDADeviceContext) {
 TEST(Device, DeviceContextPool) {
   using paddle::platform::CPUPlace;
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
   DeviceContextPool& pool = DeviceContextPool::Instance();
   auto cpu_dev_ctx1 = pool.Get(CPUPlace());
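Editor's note: the construction pattern exercised by these tests (build a phi::GPUContext for a place, then wire it to an allocator bound to its own stream before first use) recurs throughout this patch. A minimal sketch of that pattern, using only the calls visible in the hunks above; the helper name MakeTestGpuContext and the header paths are illustrative assumptions, not part of the patch:

// Sketch: create a phi::GPUContext for device `dev_id` the way the tests do.
// The tests shown here only set the device allocator; nothing else.
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"

phi::GPUContext* MakeTestGpuContext(int dev_id) {
  paddle::platform::CUDAPlace place(dev_id);
  auto* ctx = new phi::GPUContext(place);
  // A context must be given an allocator tied to its own stream before use.
  ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                        .GetAllocator(place, ctx->stream())
                        .get());
  return ctx;
}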
diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu
index efb0d9ed756..14967edbe4e 100644
--- a/paddle/fluid/platform/device_context_test_cuda_graph.cu
+++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu
@@ -20,11 +20,11 @@ limitations under the License.
 */
 #include "paddle/fluid/platform/device_context.h"
 TEST(Device, DeviceContextWithCUDAGraph) {
-  using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::DeviceContext;
   using paddle::platform::DeviceContextPool;
   using paddle::platform::Place;
+  using phi::GPUContext;
   DeviceContextPool& pool = DeviceContextPool::Instance();
   Place place = CUDAPlace(0);
diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc
index c9d9b6915b1..37da8daf7fd 100644
--- a/paddle/fluid/platform/device_event_gpu.cc
+++ b/paddle/fluid/platform/device_event_gpu.cc
@@ -49,12 +49,11 @@ void DeviceEventCreateCUDA(DeviceEvent* event,
 void DeviceEventRecordCUDA(DeviceEvent* event, const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
   wrapper->inner_event_.Record(cuda_dev_ctx->stream());
 }
@@ -78,12 +77,11 @@ void DeviceEventFinishCUDA(const DeviceEvent* event) {
 void DeviceEventCUDAWaitCUDA(const DeviceEvent* event,
                              const DeviceContext* context) {
   auto* wrapper = static_cast(event->GetEvent().get());
-  auto* cuda_dev_ctx =
-      dynamic_cast<const platform::CUDADeviceContext*>(context);
+  auto* cuda_dev_ctx = dynamic_cast<const phi::GPUContext*>(context);
   PADDLE_ENFORCE_NOT_NULL(
       cuda_dev_ctx,
       platform::errors::PreconditionNotMet(
-          "Failed to dynamic_cast context into CUDADeviceContext."));
+          "Failed to dynamic_cast context into phi::GPUContext."));
   // calling cudaStreamWaitEvent(stream, event, 0)
   cuda_dev_ctx->WaitEvent(wrapper->inner_event_.GetRawCudaEvent());
 }
diff --git a/paddle/fluid/platform/device_event_test.cc b/paddle/fluid/platform/device_event_test.cc
index 9fb423e782d..7dfacc66437 100644
--- a/paddle/fluid/platform/device_event_test.cc
+++ b/paddle/fluid/platform/device_event_test.cc
@@ -33,8 +33,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
   // case 1. test for event_creator
@@ -83,8 +82,7 @@ TEST(DeviceEvent, CUDA) {
   auto& pool = DeviceContextPool::Instance();
   auto place = CUDAPlace(0);
-  auto* context =
-      static_cast<paddle::platform::CUDADeviceContext*>(pool.Get(place));
+  auto* context = static_cast<phi::GPUContext*>(pool.Get(place));
   ASSERT_NE(context, nullptr);
   // case 1.
test for event_creator diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index d91cb6da2dc..d6edb9ba947 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -328,7 +328,7 @@ TEST(float16, lod_tensor_on_gpu) { // CPU LoDTensor to GPU LoDTensor CUDAPlace gpu_place(0); - CUDADeviceContext gpu_ctx(gpu_place); + phi::GPUContext gpu_ctx(gpu_place); gpu_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, gpu_ctx.stream()) .get()); diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 5e0717ba635..ce68452ffbe 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -40,9 +40,9 @@ using paddle::memory::Alloc; using paddle::memory::Copy; using paddle::platform::CPUPlace; -using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; using phi::CPUContext; +using phi::GPUContext; using paddle::platform::Transform; @@ -58,7 +58,7 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { CUDAPlace gpu0(0); - CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -67,7 +67,7 @@ TEST(Transform, GPUUnary) { auto gpu_allocation = Alloc(gpu0, sizeof(float) * 4); float* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream()); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream()); @@ -89,7 +89,7 @@ TEST(Transform, CPUBinary) { TEST(Transform, GPUBinary) { int buf[4] = {1, 2, 3, 4}; CUDAPlace gpu0(0); - CUDADeviceContext ctx(gpu0); + phi::GPUContext ctx(gpu0); ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu0, ctx.stream()) .get()); @@ -97,7 +97,7 @@ TEST(Transform, GPUBinary) { auto gpu_allocation = Alloc(gpu0, sizeof(buf)); int* gpu_buf = static_cast(gpu_allocation->ptr()); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream()); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index df5b2c27122..f93e9b6de92 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1257,7 +1257,7 @@ All parameter, weight, gradient are variables in Paddle. 
"Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); #else - auto* context = new paddle::platform::CUDADeviceContext(place); + auto* context = new phi::GPUContext(place); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place, context->stream()) diff --git a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h index 5d06dddd964..9b5c24abc67 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h @@ -243,9 +243,7 @@ void gpu_lstm_forward(const paddle::platform::DeviceContext& context, grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } - auto stream = - reinterpret_cast(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmForward(context) - .stream(); + auto stream = reinterpret_cast(context).stream(); if (batch_size == 1) { KeLstmBackward -struct GRUUnitFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -93,8 +93,7 @@ struct GRUUnitFunctor { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -184,8 +183,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const paddle::platform::CUDADeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::GPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -236,8 +235,7 @@ struct GRUUnitGradFunctor { origin_mode); } - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, @@ -333,10 +331,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/lstm_compute.cu b/paddle/phi/kernels/funcs/lstm_compute.cu index b2057cfc4f9..e3e8b6cc124 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cu +++ b/paddle/phi/kernels/funcs/lstm_compute.cu @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/lstm_kernel.h" -#include "paddle/phi/kernels/funcs/lstm_compute.h" namespace phi { namespace funcs { template -struct LstmUnitFunctor { - static void compute(const paddle::platform::CUDADeviceContext& context, +struct LstmUnitFunctor { + static void compute(const phi::GPUContext& context, LstmMetaValue value, int frame_size, int batch_size, @@ -43,8 +43,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const paddle::platform::CUDADeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const phi::GPUContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, @@ -67,10 +67,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index bbd160e35c7..9f0c20ccf14 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -222,11 +222,10 @@ struct TensorSetConstantGPU { template void apply() const { - SetConstant functor; - functor( - reinterpret_cast(context_), - tensor_, - static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, + static_cast(value_)); } const paddle::platform::DeviceContext& context_; @@ -255,8 +254,8 @@ __global__ void RowwiseAddKernel( } template -struct RowwiseAdd { - void operator()(const paddle::platform::CUDADeviceContext& context, +struct RowwiseAdd { + void operator()(const phi::GPUContext& context, const paddle::framework::Tensor& input, const paddle::framework::Tensor& vector, paddle::framework::Tensor* output) { @@ -294,18 +293,18 @@ struct RowwiseAdd { } }; -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug // mode, // and only failed for this case. So reimplemented it. 
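Editor's note: the comment above is the key to the specialization that follows. Summing the columns of an m x n matrix A is the matrix-vector product A^T * ones(m), so the reduction can be delegated to a single GEMV instead of the Eigen-based ColwiseSum that failed in debug mode; RowwiseSum further down uses the same idea. A standalone cuBLAS sketch of the identity (column-major storage; the function and buffer names are illustrative, not Paddle's Blas wrapper):

// colsum[j] = sum_i A[i][j] == (A^T * ones)[j], computed with one GEMV.
#include <cublas_v2.h>

void ColumnSums(cublasHandle_t handle,
                const float* d_A,      // m x n, column-major, lda = m
                const float* d_ones,   // device vector holding m ones
                float* d_colsum,       // output, length n
                int m, int n) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  // y = alpha * A^T * x + beta * y
  cublasSgemv(handle, CUBLAS_OP_T, m, n, &alpha, d_A, m, d_ones, 1, &beta,
              d_colsum, 1);
}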
template <> -void ColwiseSum::operator()( - const paddle::platform::CUDADeviceContext& context, +void ColwiseSum::operator()( + const phi::GPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* vector) { auto in_dims = input.dims(); @@ -320,28 +319,28 @@ void ColwiseSum::operator()( vector->numel())); paddle::framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - phi::funcs::GetBlas(context) - .GEMV(true, - static_cast(in_dims[0]), - static_cast(in_dims[1]), - 1.0, - input.data(), - one.data(), - 0.0, - vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[0]), + static_cast(in_dims[1]), + 1.0, + input.data(), + one.data(), + 0.0, + vector->data()); } -template struct RowwiseSum; -// template struct RowwiseSum; +template struct RowwiseSum; +// template struct RowwiseSum; // TODO(zcd): Following ColwiseSum format, need to confirm. -// The RowwiseSum failed in debug +// The RowwiseSum failed in debug // mode, // and only failed for this case. So reimplemented it. template <> -void RowwiseSum::operator()( - const paddle::platform::CUDADeviceContext& context, +void RowwiseSum::operator()( + const phi::GPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* vector) { auto in_dims = input.dims(); @@ -356,25 +355,25 @@ void RowwiseSum::operator()( vector->numel())); paddle::framework::Tensor one; one.mutable_data({size}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - phi::funcs::GetBlas(context) - .GEMV(true, - static_cast(in_dims[1]), - static_cast(in_dims[0]), - 1.0, - one.data(), - input.data(), - 0.0, - vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[1]), + static_cast(in_dims[0]), + 1.0, + one.data(), + input.data(), + 0.0, + vector->data()); } -template struct RowwiseMean; -template struct RowwiseMean; +template struct RowwiseMean; +template struct RowwiseMean; template -struct ElementwiseAddTo { - void operator()(paddle::platform::CUDADeviceContext* ctx, +struct ElementwiseAddTo { + void operator()(phi::GPUContext* ctx, const paddle::framework::Tensor& src, paddle::framework::Tensor* dst) { auto in = paddle::framework::EigenVector::Flatten(src); @@ -384,10 +383,8 @@ struct ElementwiseAddTo { } }; -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sequence2batch.cu b/paddle/phi/kernels/funcs/sequence2batch.cu index a66030e6426..196ca7a2ef9 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cu +++ b/paddle/phi/kernels/funcs/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const paddle::platform::CUDADeviceContext& context, + void operator()(const phi::GPUContext& context, const paddle::framework::Tensor& src, paddle::framework::Vector index_lod, paddle::framework::Tensor* dst, @@ -90,19 +90,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template 
class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index e0b7bba50d6..657430e1e75 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -79,8 +79,7 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - auto* ctx = reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, input, input_width, @@ -131,9 +130,7 @@ void TopkKernel(const Context& dev_ctx, dev_ctx.template Alloc(&sorted_output); dev_ctx.template Alloc(&sorted_indices); dev_ctx.template Alloc(&gather_indices); - auto* ctx = - reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, out, k, @@ -239,8 +236,7 @@ void TopkKernel(const Context& dev_ctx, // The conclusion is drawn from the data through multiple sets of // statistics if (input_width >= 128 && k >= input_width * 0.75) { - auto* ctx = reinterpret_cast( - &dev_ctx); + auto* ctx = reinterpret_cast(&dev_ctx); if (ops::SortTopk(*ctx, &trans_input, input_width, diff --git a/paddle/phi/tests/kernels/test_math_function.cu b/paddle/phi/tests/kernels/test_math_function.cu index 853187fc802..479d874626a 100644 --- a/paddle/phi/tests/kernels/test_math_function.cu +++ b/paddle/phi/tests/kernels/test_math_function.cu @@ -37,9 +37,9 @@ void fill_fp16_data(phi::dtype::float16* in_ptr, } template -inline phi::funcs::BlasT GetBlas( - const paddle::platform::CUDADeviceContext& context) { - return phi::funcs::GetBlas(context); +inline phi::funcs::BlasT GetBlas( + const phi::GPUContext& context) { + return phi::funcs::GetBlas(context); } TEST(math_function, notrans_mul_trans_fp32) { @@ -51,7 +51,7 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -87,7 +87,7 @@ TEST(math_function, notrans_mul_trans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -134,7 +134,7 @@ TEST(math_function, trans_mul_notrans_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -176,7 +176,7 @@ TEST(math_function, trans_mul_notrans_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -229,7 +229,7 @@ TEST(math_function, 
gemm_notrans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -287,7 +287,7 @@ TEST(math_function, gemm_notrans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -364,7 +364,7 @@ TEST(math_function, gemm_trans_cublas_fp32) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -416,7 +416,7 @@ TEST(math_function, gemm_trans_cublas_fp16) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); @@ -485,7 +485,7 @@ void GemvTest(int m, int n, bool trans) { paddle::platform::CPUPlace cpu_place; paddle::platform::CUDAPlace gpu_place(0); - paddle::platform::CUDADeviceContext context(gpu_place); + phi::GPUContext context(gpu_place); context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(gpu_place, context.stream()) .get()); diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h index ffe89fde047..70919708e19 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h @@ -62,7 +62,7 @@ struct ReluFunctor { #if defined(__NVCC__) || defined(__HIPCC__) if (paddle::platform::is_gpu_place(place)) { - LAUNCH_RELU_KERNEL(paddle::platform::CUDADeviceContext); + LAUNCH_RELU_KERNEL(phi::GPUContext); return; } #endif -- GitLab
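Editor's note: the LAUNCH_RELU_KERNEL(phi::GPUContext) call in the custom-op test above expands to an element-wise ReLU launched on the context's stream. A self-contained CUDA sketch of such a kernel (not the macro's actual expansion, which lives in custom_raw_op_kernel_op.h):

// Minimal element-wise ReLU, launched on an explicit stream.
#include <cuda_runtime.h>

__global__ void ReluKernel(const float* x, float* y, size_t n) {
  size_t i = blockIdx.x * static_cast<size_t>(blockDim.x) + threadIdx.x;
  if (i < n) y[i] = x[i] > 0.0f ? x[i] : 0.0f;
}

void LaunchRelu(const float* x, float* y, size_t n, cudaStream_t stream) {
  const int threads = 256;
  const int blocks = static_cast<int>((n + threads - 1) / threads);
  ReluKernel<<<blocks, threads, 0, stream>>>(x, y, n);
}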