diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h
index 22a059773f513f1a4a3aef0d3ca9b603fcd30bf8..eb536560b62d7993eae48aeaa1b09f7f0b5cbbf7 100644
--- a/paddle/fluid/framework/details/nccl_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_op_handle.h
@@ -94,7 +94,7 @@ class NCCLOpHandleBase : public OpHandleBase {
         continue;
       }
 
-      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+      platform::SetDeviceId(dev_id);
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
           &inter_events_[dev_id], cudaEventDisableTiming));
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index eeff0f3d46d633c8f834dba96e0ada2e09dd86a0..240be51a442bec0184133ef4d175130ff67ec99e 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -47,7 +47,7 @@ void OpHandleBase::InitCUDA() {
 #ifdef PADDLE_WITH_CUDA
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+    platform::SetDeviceId(dev_id);
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
   }
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index ed92e2e9aadb36b026334c1074bdd60d11beeb1b..8ba94f4fd7a79646ba69732371ed01456c6be41f 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -50,7 +50,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
+  platform::SetDeviceId(local_rank);
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
 #endif
   return;
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 1f7ea7ea04404712a29ef486004601f63510ea74..90b3e2c0e975bd6b5c6bf0de74c806a35009c5c5 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -339,7 +339,7 @@ void TensorRTEngine::freshDeviceId() {
                     platform::errors::OutOfRange(
                         "Device id %d exceeds the current device count: %d.",
                         device_id_, count));
-  cudaSetDevice(device_id_);
+  platform::SetDeviceId(device_id_);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
index 89853e159bde378ff1084ff656718c5f4316f051..c9fbaf351ea0077219e302e7d0cb513db089ed31 100644
--- a/paddle/fluid/memory/malloc_test.cu
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -64,7 +64,7 @@ void MultiStreamCompute(float **data, float **second_data,
 
 TEST(Malloc, CUDADeviceContextMultiStream) {
   auto place = platform::CUDAPlace(0);
-  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
+  platform::SetDeviceId(0);
 
   AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
   EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
@@ -94,7 +94,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) {
 
 TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
   auto place = platform::CUDAPlace(0);
-  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
+  platform::SetDeviceId(0);
 
   AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
   EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index d2d9b41fcce3a1c1cdc7cd6f142d4b77537dff63..08d70404a246e75a4584edb056d509e876d8c6f1 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -75,7 +75,7 @@ NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks,
           "Expected dev_id >= 0. But received dev_id is %d.", dev_id));
 
   ncclComm_t comm = nullptr;
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
+  SetDeviceId(dev_id);
   PADDLE_ENFORCE_CUDA_SUCCESS(
       platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank));
 
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 2a6714c39a1cb4c435dd33a3ee9dd86fe561c1b6..f4c58920b8ee89e974accd44084e4cf40a7e96d9 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -226,7 +226,7 @@ void SetDeviceId(int id) {
                         "Device id must be less than GPU count, "
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
-  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id));
+  PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
 }
 
 void GpuMemoryUsage(size_t *available, size_t *total) {
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index c2f4d6ff2fffbdb60dffa0ff4e48790d78c56369..e6c5f06c4c4b5475701d1cbedc7f92234fd2f6c6 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -132,7 +132,7 @@ struct NCCLContextMap {
           }
           VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
                   << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-          PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+          SetDeviceId(gpu_id);
           PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
               comms.get() + i, nranks, *nccl_id, rank));
         }