From c8fac5ee3048749a41b2f21e041755bbaf18db1e Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 26 Feb 2021 10:22:04 +0800 Subject: [PATCH] [ROCM] update fluid framework for rocm (part5), test=develop (#31014) --- paddle/fluid/framework/garbage_collector.cc | 15 +++++++-- paddle/fluid/framework/garbage_collector.h | 6 ++-- paddle/fluid/framework/heter_service.h | 2 +- paddle/fluid/framework/heterbox_trainer.cc | 29 ++++++++++++----- paddle/fluid/framework/lod_tensor.h | 2 +- paddle/fluid/framework/lod_tensor_test.cu | 14 +++++++-- paddle/fluid/framework/mixed_vector.h | 2 +- paddle/fluid/framework/mixed_vector_test.cu | 26 ++++++++++++++- paddle/fluid/framework/op_registry.h | 2 +- paddle/fluid/framework/operator.cc | 6 +++- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 35 ++++++++++++++------- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 2 +- paddle/fluid/framework/ps_gpu_trainer.cc | 3 +- paddle/fluid/framework/ps_gpu_worker.cc | 3 +- paddle/fluid/framework/pull_dense_worker.cc | 18 ++++++----- paddle/fluid/framework/save_load_util.cc | 2 +- paddle/fluid/framework/section_worker.cc | 12 +++++-- 19 files changed, 135 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 907b3413907..c8b6c764255 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "gflags/gflags.h" @@ -53,7 +53,7 @@ void XPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -82,18 +82,27 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { platform::CUDADeviceGuard guard(place.device); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); +#endif callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(place.device); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); +#endif } -cudaStream_t StreamGarbageCollector::stream() const { return stream_; } +gpuStream_t StreamGarbageCollector::stream() const { return stream_; } void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 9148d2f2520..97800865af8 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -80,7 +80,7 @@ class XPUGarbageCollector : public GarbageCollector { }; #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, @@ -110,13 +110,13 @@ class StreamGarbageCollector : public GarbageCollector { void Wait() const override; - cudaStream_t stream() const; + gpuStream_t stream() const; protected: void ClearCallback(const std::function &callback) override; private: - cudaStream_t stream_; + gpuStream_t stream_; std::unique_ptr callback_manager_; }; diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index a6687f9a650..8f52235c962 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -152,7 +152,7 @@ class HeterObjectPool { std::lock_guard lock(mutex_); if (pool_.empty()) { num_ += 1; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) VLOG(0) << "pool construct size: " << num_; #endif return std::make_shared(); diff --git a/paddle/fluid/framework/heterbox_trainer.cc b/paddle/fluid/framework/heterbox_trainer.cc index 3e55576b846..1f6dc39ae85 100644 --- a/paddle/fluid/framework/heterbox_trainer.cc +++ b/paddle/fluid/framework/heterbox_trainer.cc @@ -21,9 +21,10 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ + defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif namespace paddle { @@ -48,16 +49,25 @@ void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc, dataset->GetReaders(); for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); - cudaStream_t stream; + gpuStream_t stream; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); +#endif copy_streams_.push_back(stream); places_.push_back(place); - cudaEvent_t event; + gpuEvent_t event; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#else PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); +#endif events_.push_back(event); #endif #ifdef PADDLE_WITH_XPU @@ -140,8 +150,13 @@ void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program, _ForEachDataType_(HeterMemcpyFunc); } } +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); + hipEventSynchronize(event); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); +#endif } place_ = place; } @@ -150,7 +165,7 @@ template void HeterBoxTrainer::HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, const paddle::platform::Place& thread_place, - cudaStream_t stream) { + gpuStream_t stream) { T* thread_ptr = thread_tensor->mutable_data(root_tensor->dims(), thread_place); T* root_ptr = root_tensor->data(); @@ -171,7 +186,7 @@ void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) { for (size_t i = 0; i < places_.size(); ++i) { pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope()); pull_dense_worker_->AddPlace(places_[i]); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) pull_dense_worker_->AddStream(copy_streams_[i]); #endif } diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index e09a628f491..b8911154e6b 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include #include #endif diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index d58cfe447e8..ddda7231887 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include #include "gtest/gtest.h" @@ -34,8 +32,14 @@ TEST(LoD, data) { auto& v = lod[0]; paddle::platform::CUDAPlace gpu(0); +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu), + v.size()); + hipDeviceSynchronize(); +#else test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); +#endif for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -59,8 +63,14 @@ TEST(LoDTensor, LoDInGPU) { auto lod = lod_tensor.lod(); +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0, + lod[0].CUDAMutableData(place), lod[0].size()); + hipDeviceSynchronize(); +#else test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); +#endif for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 3a6e80f718d..1e9b498bb2b 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -31,7 +31,7 @@ limitations under the License. */ namespace paddle { namespace framework { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 4b0caa8d350..8fb59d682e4 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -12,7 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PADDLE_WITH_CUDA #include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + #include #include "glog/logging.h" @@ -22,6 +28,7 @@ template using vec = paddle::framework::Vector; +using gpuStream_t = paddle::gpuStream_t; static __global__ void multiply_10(int* ptr) { for (int i = 0; i < 10; ++i) { @@ -29,7 +36,7 @@ static __global__ void multiply_10(int* ptr) { } } -cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { +gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { return reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); @@ -43,7 +50,12 @@ TEST(mixed_vector, GPU_VECTOR) { ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu), + tmp.MutableData(gpu)); +#else multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); +#endif for (int i = 0; i < 10; ++i) { ASSERT_EQ(tmp[i], i * 10); @@ -64,11 +76,23 @@ TEST(mixed_vector, MultiGPU) { ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); + +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu0), + tmp.MutableData(gpu0)); +#else multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); +#endif paddle::platform::CUDAPlace gpu1(1); auto* gpu1_ptr = tmp.MutableData(gpu1); paddle::platform::SetDeviceId(1); + +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu1), + gpu1_ptr); +#else multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); +#endif for (int i = 0; i < 10; ++i) { ASSERT_EQ(tmp[i], i * 100); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index e32ab8c7442..472c6f40826 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -369,7 +369,7 @@ struct OpKernelRegistrarFunctorEx is_persistable_; -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; @@ -483,7 +483,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); @@ -572,7 +572,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -590,14 +590,24 @@ void InitP2P(const std::vector &places) { for (int j = 0; j < count; ++j) { if (devices[i] == devices[j]) continue; int can_acess = -1; +#ifdef PADDLE_WITH_HIP + hipError_t ret = + hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); + if (ret != hipSuccess || can_acess != 1) { +#else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != cudaSuccess || can_acess != 1) { +#endif LOG(WARNING) << "Cannot enable P2P access from " << devices[i] << " to " << devices[j]; } else { platform::CUDADeviceGuard guard(devices[i]); +#ifdef PADDLE_WITH_HIP + hipDeviceEnablePeerAccess(devices[j], 0); +#else cudaDeviceEnablePeerAccess(devices[j], 0); +#endif } } } @@ -630,7 +640,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if defined(PADDLE_WITH_CUDA) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( places.size(), 1, @@ -638,7 +648,8 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } #endif -#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( places.size(), 1, @@ -710,7 +721,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); // Initialize device context's nccl comm, will be used by normal @@ -774,7 +785,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (member_->build_strategy_.async_mode_) { VLOG(3) << "use local async mode"; graph = member_->build_strategy_.Apply( @@ -885,7 +896,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, final_graphs = async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); @@ -996,7 +1007,7 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 0a1df2f1946..47de7dc48f4 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -32,7 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 01ab494adef..8d350f70165 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index bca1843dd8f..962f666478c 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -24,7 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) #include "paddle/fluid/platform/cuda_device_guard.h" namespace paddle { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index d75a32a8802..1540679e00c 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) #include "paddle/fluid/platform/cuda_device_guard.h" #if defined _WIN32 || defined __APPLE__ diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index fb268e4b6cf..77d8abcd26e 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -59,17 +59,19 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { current_version_[tid] = 0; } fleet_ptr_ = FleetWrapper::GetInstance(); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) copy_streams_.clear(); #endif -#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif } void PullDenseWorker::CreatePinVar() { -#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -84,7 +86,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); LoDTensor* pin_tensor = ptr->GetMutable(); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -113,7 +115,8 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -131,7 +134,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); LoDTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); @@ -161,7 +164,8 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], &pull_dense_status_, false); diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index bd5725f49c0..1731a974b71 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -297,7 +297,7 @@ bool SaveTensorToDisk(const std::string& file_name, tensor->numel() * framework::SizeOfType(tensor->type()); auto* data_ptr = tensor->data(); if (platform::is_gpu_place(tensor->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); data_ptr = temp.data(); diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 6e17551818c..735c86faf08 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -38,7 +38,7 @@ void SectionWorker::TrainFiles() { std::unique_ptr gc; auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_); if (max_memory_size >= 0) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector( @@ -70,7 +70,11 @@ void SectionWorker::TrainFiles() { } } } +#ifdef PADDLE_WITH_RCCL + hipDeviceSynchronize(); +#else cudaDeviceSynchronize(); +#endif } // backward pass @@ -89,7 +93,11 @@ void SectionWorker::TrainFiles() { } } } +#ifdef PADDLE_WITH_RCCL + hipDeviceSynchronize(); +#else cudaDeviceSynchronize(); +#endif } // update pass -- GitLab