diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu
index 681fb1fdb295ce01117fe9d46083f9282ab091dd..3c4f2c5bbc74ddd8b6291131d1fca5d925c3ed03 100644
--- a/paddle/fluid/framework/data_feed.cu
+++ b/paddle/fluid/framework/data_feed.cu
@@ -1049,7 +1049,7 @@ void GraphDataGenerator::AllocResource(const paddle::platform::Place &place,
   place_ = place;
   gpuid_ = place_.GetDeviceId();
   VLOG(3) << "gpuid " << gpuid_;
-  stream_ = dynamic_cast<platform::CUDADeviceContext*>(
+  stream_ = dynamic_cast<phi::GPUContext*>(
                 platform::DeviceContextPool::Instance().Get(place))
                 ->stream();
   feed_vec_ = feed_vec;
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
index e57a02d72999c153b5a68279c5fe553593d9bd2f..f05fe6c95de0a5c63f45aed21daa2d4a39321f83 100644
--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
@@ -394,7 +394,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPullDedupImpl(
     const int* slot_dims,
     const uint32_t* gpu_restore_idx,
     int pull_value_size) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   size_t N = total_length * hidden_size;
@@ -428,7 +428,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
     const int* key2slot,
     const uint32_t* d_restore_idx,
     const size_t grad_value_size) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   cudaMemsetAsync(
@@ -470,7 +470,7 @@ void AccessorWrapper<GPUAccessor>::CopyForPushDedupImpl(
     const uint32_t* gpu_sort_offset,
     const uint32_t* gpu_sort_lens,
     const size_t grad_value_size) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     paddle::platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   // merge all grad to one
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
index bbeb5977635e9780317eaaf6bce365dd4ff43910..40597aed31f3d0ec04dc6ac7d7ea2b1d09cb9fb7 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -1130,7 +1130,7 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,

   VLOG(3) << "[" << device_id << "]Begin copy keys, key_num["
           << total_length << "] dedup mode";
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
@@ -1399,7 +1399,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
   VLOG(3) << "Begin push sparse, key_num[" << total_length
           << "] dedup mode, device:" << device_id << ", index"
           << devid_2_index;
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   uint64_t* total_keys = dev.keys_tensor.data<uint64_t>();
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
index 36b789bdd11084a75e9b68779b9fa69508146e26..7f27b6889fc9818708e41298f5c63868bc88d71b 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
@@ -128,7 +128,7 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                             int slot_num,
                             int total_len,
                             int* key2slot) {
-  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+  auto stream = dynamic_cast<phi::GPUContext*>(
                     platform::DeviceContextPool::Instance().Get(place))
                     ->stream();
   CopyKeysKernel2<<<CUDA_BLOCK(total_len), stream>>>(