From 50967135a503c61931a650bf8cdfe8cb64bde986 Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Mon, 22 Feb 2021 20:34:08 +0800
Subject: [PATCH] [ROCM] update fluid framework for rocm (part3), test=develop
 (#31011)

---
 paddle/fluid/framework/details/CMakeLists.txt | 19 ++++-
 .../details/eager_deletion_op_handle.cc       | 27 +++++++---
 .../details/eager_deletion_op_handle.h        |  4 +-
 .../details/fetch_async_op_handle.cc          |  2 +-
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../details/fused_all_reduce_op_handle.cc     |  2 +-
 .../details/fused_all_reduce_op_handle.h      |  4 +-
 .../details/fused_broadcast_op_handle.h       |  4 +-
 .../details/fused_broadcast_op_handle_test.cc |  5 +-
 .../framework/details/nan_inf_utils_detail.cc |  2 +-
 .../framework/details/nan_inf_utils_detail.cu | 18 +++++++
 .../fluid/framework/details/nccl_op_handle.h  | 50 +++++++++++++++++--
 .../fluid/framework/details/op_handle_base.cc | 50 +++++++++++++++----
 .../fluid/framework/details/op_handle_base.h  |  4 +-
 .../framework/details/reduce_op_handle.cc     |  2 +-
 .../framework/details/reduce_op_handle.h      |  7 +--
 .../details/reduce_op_handle_test.cc          | 14 +++---
 .../details/scale_loss_grad_op_handle.cc      |  4 +-
 .../details/share_tensor_buffer_op_handle.cc  |  2 +-
 19 files changed, 175 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index dce256ebc4..9d2bf5bf3f 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -45,7 +45,24 @@ if(WITH_GPU)
   endif()
   nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
   nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+elseif(WITH_ROCM)
+  hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
+  hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor)
+  hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor place device_memory_aligment)
+  hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
+          ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
+  if(WITH_DISTRIBUTE)
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  else()
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  endif()
+  hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+  hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 else()
   cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
   cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
         fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
         sync_batch_norm_pass runtime_context_cache_pass)
-if(NOT APPLE AND NOT WIN32 AND WITH_GPU)
+if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 15866e5482..2fefbd6177 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -16,7 +16,7 @@
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       place_(place),
       var_infos_(vars.begin(), vars.end()),
       gc_(gc) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
     dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event_, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+#endif
       PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
                                           "The cuda event created is NULL."));
     }
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 }
 
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
+#endif
   }
 #endif
 }
 
 void EagerDeletionOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
 
 void EagerDeletionOpHandle::ClearGarbages(
     std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto compute_stream = dev_ctx_->stream();
    auto callback_stream =
        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
    auto callback_func = [=]() {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamWaitEvent(callback_stream, event_, 0));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaStreamWaitEvent(callback_stream, event_, 0));
+#endif
     };
     gc_->Add(std::move(*garbages), callback_func);
   } else {
 #endif
     gc_->Add(std::move(*garbages));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   }
 #endif
 }
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index c5079798d9..b1b8c21230 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
   std::vector<ir::MemOptVarInfo *> var_infos_;  // not own
 
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDADeviceContext *dev_ctx_{nullptr};
-  cudaEvent_t event_{nullptr};
+  gpuEvent_t event_{nullptr};
 #endif
 };
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc
index 5fb13491ae..f59d947e27 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
                       const platform::DeviceContext &ctx) {
   if (src_item->IsInitialized() && src_item->numel() > 0) {
     if (platform::is_gpu_place(src_item->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
 #endif
     } else {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 8a1ba6f48a..0a116cd9d8 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
                       framework::LoDTensor *dst_item) {
   if (src_item.IsInitialized() && src_item.numel() > 0) {
     if (platform::is_gpu_place(src_item.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index a5284468b6..f792f7f896 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -26,7 +26,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
index 463460a1ff..d22dc0a421 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -33,7 +33,7 @@ namespace platform {
 class NCCLCommunicator;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
@@ -44,7 +44,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index ee45521c21..2fd1e0e7e9 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -36,7 +36,7 @@ struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -46,7 +46,7 @@ namespace details {
 
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index d12a1cdc77..42c815f958 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
   test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(FusedBroadcastTester, GPULodTensor) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 06de2d2973..103dd0c5ae 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
 
   if (platform::is_gpu_place(tensor->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
                                               place);
 #else
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index a46b4d0e5a..55261cf7cd 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
   }
   __syncthreads();
 
+#ifdef PADDLE_WITH_HIP
+  if (true && hipThreadIdx_x == 0) {
+    printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
+           nan_count, inf_count, num_count);
+#else
   if (true && threadIdx.x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", blockIdx.x,
            nan_count, inf_count, num_count);
+#endif
     PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
   }
 }
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
                                        "op_var2gpu_str, but now failed",
                                        op_var));
 
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
+                       hipMemcpyHostToDevice, dev_ctx->stream()));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                         cudaMemcpyHostToDevice, dev_ctx->stream()));
+#endif
   } else {  // get
     auto iter = op_var2gpu_str.find(op_var);
     PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
+                     dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
+                     print_num, gpu_str_ptr);
+#else
   CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
       tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
+#endif
 }
 
 template <>
diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h
index eb536560b6..762f4071b5 100644
--- a/paddle/fluid/framework/details/nccl_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_op_handle.h
@@ -21,7 +21,12 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/nccl_helper.h"
 
 DECLARE_bool(sync_nccl_allreduce);
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
     for (auto& ev : exter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
     }
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &inter_events_[dev_id], hipEventDisableTiming));
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &exter_events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &inter_events_[dev_id], cudaEventDisableTiming));
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &exter_events_[dev_id], cudaEventDisableTiming));
+#endif
     VLOG(10) << "Create events on dev_id:" << dev_id
              << ", inter_event:" << &inter_events_[dev_id]
              << ", exter_event:" << &exter_events_[dev_id];
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(inter_events_.at(dev_id), stream);
+#else
     cudaEventRecord(inter_events_.at(dev_id), stream);
+#endif
 
     if (FLAGS_sync_nccl_allreduce) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
     }
   }
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", dev_id:" << dev_id << ", dtype:" << datatype
             << ", place:" << place << ", stream:" << stream;
 
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, count, datatype, op, comm, stream));
+
+    hipEventRecord(exter_events_.at(dev_id), stream);
+
+    if (FLAGS_sync_nccl_allreduce) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+    }
+#else
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
     if (FLAGS_sync_nccl_allreduce) {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
+#endif
   }
 
   void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", numel:" << count << ", dev_id:" << dev_id
             << ", dtype:" << datatype << ", place:" << place
             << ", stream:" << stream;
-
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#else
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#endif
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         sendbuff, count, datatype, 0, comm, stream));
   }
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
 
  private:
   // hierarchical needed events
-  std::unordered_map<int, cudaEvent_t> inter_events_;
-  std::unordered_map<int, cudaEvent_t> exter_events_;
+  std::unordered_map<int, gpuEvent_t> inter_events_;
+  std::unordered_map<int, gpuEvent_t> exter_events_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index b7f9315325..4b5d0563d7 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
 }
 
 OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &ev : events_) {
     if (ev.second) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
 #endif
 }
 
 void OpHandleBase::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+#endif
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
     for (auto &out_var : outputs_) {
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
 }
 
 void OpHandleBase::Run(DeviceType use_device) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
     InitCUDA();
   }
@@ -158,7 +167,7 @@
 }
 
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
                                           "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
+#endif
     }
   }
 #else
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         auto stream =
             static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(
             cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
         PADDLE_THROW(
             platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         platform::DeviceContextPool &pool =
             platform::DeviceContextPool::Instance();
         auto stream =
             static_cast<platform::CUDADeviceContext *>(pool.Get(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 #else
         PADDLE_THROW(platform::errors::PreconditionNotMet(
             "Not compiled with CUDA."));
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
       auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
       if (in_var_handle) {
         if (platform::is_gpu_place(in_var_handle->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
           auto stream = static_cast<platform::CUDADeviceContext *>(
                             dev_ctxes_.at(in_var_handle->place()))
                             ->stream();
+#ifdef PADDLE_WITH_HIP
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
           PADDLE_ENFORCE_CUDA_SUCCESS(
               cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
           PADDLE_THROW(
               platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
   callback();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
       auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
       VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#endif
     }
   }
 #endif
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 
 void OpHandleBase::RunAndRecordEvent(platform::Place p,
                                      const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_cpu_place(p) || events_.empty()) {
     callback();
   } else {
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 11df07e20e..93bdf92f19 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -157,8 +157,8 @@ class OpHandleBase {
   std::vector<Scope *> local_exec_scopes_;
   bool skip_running_ = false;
 
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::unordered_map<int, gpuEvent_t> events_;
 #endif
 
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index c7189928d0..1d78a650f9 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
       }
     });
   } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto pre_in = pre_in_var->Get<framework::SelectedRows>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 011c5ef2f1..569699c19c 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -40,7 +40,7 @@ namespace platform {
 struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
 
   std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
 
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
+    defined PADDLE_WITH_DISTRIBUTE
   template <typename DevCtx, typename DataType>
   void GatherSelectedRows(
       const std::vector<const SelectedRows *> &src_selecte_rows_,
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 0ae53b35a4..82f5ea6a66 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      nccl_ctxs_.reset(nullptr);
 #endif
    }
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
    nodes.emplace_back(new ir::Node("node"));
 
    if (use_gpu_) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW(
          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
    } else {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
   test_op.InitReduceOp(out_scope_idx);
   test_op.TestReduceLodTensors(out_scope_idx);
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
   TestReduceOpHandle test_op;
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index aa32a248e7..fcfbfd0557 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
           "Please recompile or reinstall Paddle with XPU support."));
 #endif
     } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       OutT cast_coeff = static_cast<OutT>(coeff_);
       auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
       local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
   tensor->Resize(make_ddim({1}));
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
                             this->dev_ctxes_.at(place_));
   this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index 0b14b33cf8..f75cd982f7 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
 }
 
 void ShareTensorBufferOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
-- 
GitLab
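
Illustrative sketch (not part of the patch above): the change repeats one
portability pattern throughout -- a shared gpuEvent_t/gpuStream_t alias plus
PADDLE_WITH_HIP / PADDLE_WITH_CUDA branches around otherwise-identical runtime
calls. The snippet below is a minimal, self-contained reconstruction of that
pattern. The PADDLE_WITH_* macro names and the event-sync call sequence come
from the patch itself; the standalone alias definitions and the OrderStreams
helper are hypothetical, written only to keep the example compilable on its
own (in Paddle the aliases come from the platform headers).

// Sketch of the CUDA/HIP event pattern this patch applies; assumes either
// PADDLE_WITH_HIP or PADDLE_WITH_CUDA semantics are selected at compile time.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;    // in Paddle these aliases are provided by
using gpuStream_t = hipStream_t;  // the platform headers, not defined locally
#else
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;
using gpuStream_t = cudaStream_t;
#endif

// Create a timing-disabled event, record it on the producer stream, and make
// the consumer stream wait on it -- the same sequence EagerDeletionOpHandle
// uses to order garbage collection after compute. Error checking (which the
// real code wraps in PADDLE_ENFORCE_CUDA_SUCCESS) is elided here; destroying
// the event immediately is safe because both runtimes defer the release until
// enqueued work that references it has completed.
inline void OrderStreams(gpuStream_t producer, gpuStream_t consumer) {
  gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
  hipEventCreateWithFlags(&event, hipEventDisableTiming);
  hipEventRecord(event, producer);
  hipStreamWaitEvent(consumer, event, 0);
  hipEventDestroy(event);
#else
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, producer);
  cudaStreamWaitEvent(consumer, event, 0);
  cudaEventDestroy(event);
#endif
}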