diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 22a059773f513f1a4a3aef0d3ca9b603fcd30bf8..eb536560b62d7993eae48aeaa1b09f7f0b5cbbf7 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -94,7 +94,7 @@ class NCCLOpHandleBase : public OpHandleBase { continue; } - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + platform::SetDeviceId(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index eeff0f3d46d633c8f834dba96e0ada2e09dd86a0..240be51a442bec0184133ef4d175130ff67ec99e 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -47,7 +47,7 @@ void OpHandleBase::InitCUDA() { #ifdef PADDLE_WITH_CUDA for (auto &p : dev_ctxes_) { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + platform::SetDeviceId(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index ed92e2e9aadb36b026334c1074bdd60d11beeb1b..8ba94f4fd7a79646ba69732371ed01456c6be41f 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -50,7 +50,7 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank)); + platform::SetDeviceId(local_rank); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif return; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 1f7ea7ea04404712a29ef486004601f63510ea74..90b3e2c0e975bd6b5c6bf0de74c806a35009c5c5 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -339,7 +339,7 @@ void TensorRTEngine::freshDeviceId() { platform::errors::OutOfRange( "Device id %d exceeds the current device count: %d.", device_id_, count)); - cudaSetDevice(device_id_); + platform::SetDeviceId(device_id_); } } // namespace tensorrt diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 89853e159bde378ff1084ff656718c5f4316f051..c9fbaf351ea0077219e302e7d0cb513db089ed31 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -64,7 +64,7 @@ void MultiStreamCompute(float **data, float **second_data, TEST(Malloc, CUDADeviceContextMultiStream) { auto place = platform::CUDAPlace(0); - EXPECT_TRUE(cudaSuccess == cudaSetDevice(0)); + platform::SetDeviceId(0); AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float)); EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float)); @@ -94,7 +94,7 @@ TEST(Malloc, CUDADeviceContextMultiStream) { TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { auto place = platform::CUDAPlace(0); - EXPECT_TRUE(cudaSuccess == cudaSetDevice(0)); + platform::SetDeviceId(0); AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float)); EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float)); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index d2d9b41fcce3a1c1cdc7cd6f142d4b77537dff63..08d70404a246e75a4584edb056d509e876d8c6f1 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -75,7 +75,7 @@ NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); ncclComm_t comm = nullptr; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + SetDeviceId(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 2a6714c39a1cb4c435dd33a3ee9dd86fe561c1b6..f4c58920b8ee89e974accd44084e4cf40a7e96d9 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -226,7 +226,7 @@ void SetDeviceId(int id) { "Device id must be less than GPU count, " "but received id is: %d. GPU count is: %d.", id, GetCUDADeviceCount())); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(id)); + PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } void GpuMemoryUsage(size_t *available, size_t *total) { diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index c2f4d6ff2fffbdb60dffa0ff4e48790d78c56369..e6c5f06c4c4b5475701d1cbedc7f92234fd2f6c6 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -132,7 +132,7 @@ struct NCCLContextMap { } VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; - PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id)); + SetDeviceId(gpu_id); PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); }