From 50967135a503c61931a650bf8cdfe8cb64bde986 Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Mon, 22 Feb 2021 20:34:08 +0800
Subject: [PATCH] [ROCM] update fluid framework for rocm (part3), test=develop
 (#31011)

---
 paddle/fluid/framework/details/CMakeLists.txt | 19 ++++-
 .../details/eager_deletion_op_handle.cc       | 27 +++++++---
 .../details/eager_deletion_op_handle.h        |  4 +-
 .../details/fetch_async_op_handle.cc          |  2 +-
 .../framework/details/fetch_op_handle.cc      |  2 +-
 .../details/fused_all_reduce_op_handle.cc     |  2 +-
 .../details/fused_all_reduce_op_handle.h      |  4 +-
 .../details/fused_broadcast_op_handle.h       |  4 +-
 .../details/fused_broadcast_op_handle_test.cc |  5 +-
 .../framework/details/nan_inf_utils_detail.cc |  2 +-
 .../framework/details/nan_inf_utils_detail.cu | 18 +++++++
 .../fluid/framework/details/nccl_op_handle.h  | 50 +++++++++++++++++--
 .../fluid/framework/details/op_handle_base.cc | 50 +++++++++++++++----
 .../fluid/framework/details/op_handle_base.h  |  4 +-
 .../framework/details/reduce_op_handle.cc     |  2 +-
 .../framework/details/reduce_op_handle.h      |  7 +--
 .../details/reduce_op_handle_test.cc          | 14 +++---
 .../details/scale_loss_grad_op_handle.cc      |  4 +-
 .../details/share_tensor_buffer_op_handle.cc  |  2 +-
 19 files changed, 175 insertions(+), 47 deletions(-)

diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index dce256ebc4..9d2bf5bf3f 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -45,7 +45,24 @@ if(WITH_GPU)
   endif()
   nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
   nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
+elseif(WITH_ROCM)
+  hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
+  hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor)
+  hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+          dynload_cuda variable_visitor place device_memory_aligment)
+  hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
+          ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
+  if(WITH_DISTRIBUTE)
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  else()
+    hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
+            ddim dynload_cuda selected_rows_functor)
+  endif()
+  hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+  hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 else()
   cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
   cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
         coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
         fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
         sync_batch_norm_pass runtime_context_cache_pass)
-if(NOT APPLE AND NOT WIN32 AND WITH_GPU)
+if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
   set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
 endif()
 cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
index 15866e5482..2fefbd6177 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc
@@ -16,7 +16,7 @@
 #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
 #include "paddle/fluid/platform/profiler.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
       place_(place),
       var_infos_(vars.begin(), vars.end()),
       gc_(gc) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_gpu_place(place)) {
     dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
     if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, place).device);
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventCreateWithFlags(&event_, hipEventDisableTiming));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+#endif
       PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
                                           "The cuda event created is NULL."));
     }
@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
 }
 
 EagerDeletionOpHandle::~EagerDeletionOpHandle() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
     platform::CUDADeviceGuard guard(gpu_place.device);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
+#endif
   }
 #endif
 }
 
 void EagerDeletionOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
 
 void EagerDeletionOpHandle::ClearGarbages(
     std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (event_) {
     auto compute_stream = dev_ctx_->stream();
    auto callback_stream =
        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
    auto callback_func = [=]() {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipStreamWaitEvent(callback_stream, event_, 0));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaStreamWaitEvent(callback_stream, event_, 0));
+#endif
     };
     gc_->Add(std::move(*garbages), callback_func);
   } else {
 #endif
     gc_->Add(std::move(*garbages));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   }
 #endif
 }
diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h
index c5079798d9..b1b8c21230 100644
--- a/paddle/fluid/framework/details/eager_deletion_op_handle.h
+++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h
@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
   std::vector<ir::MemOptVarInfo *> var_infos_;  // not own
 
   GarbageCollector *gc_;  // not own
   std::vector<Variable *> vars_;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDADeviceContext *dev_ctx_{nullptr};
-  cudaEvent_t event_{nullptr};
+  gpuEvent_t event_{nullptr};
 #endif
 };
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc
index 5fb13491ae..f59d947e27 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc
@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
                       const platform::DeviceContext &ctx) {
   if (src_item->IsInitialized() && src_item->numel() > 0) {
     if (platform::is_gpu_place(src_item->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
 #endif
     } else {
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 8a1ba6f48a..0a116cd9d8 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
                       framework::LoDTensor *dst_item) {
   if (src_item.IsInitialized() && src_item.numel() > 0) {
     if (platform::is_gpu_place(src_item.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       TensorCopy(src_item, platform::CPUPlace(), dst_item);
 #endif
     } else {
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index a5284468b6..f792f7f896 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -26,7 +26,7 @@ namespace details {
 typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
     GradientAndLoDTensor;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
index 463460a1ff..d22dc0a421 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -33,7 +33,7 @@ namespace platform {
 class NCCLCommunicator;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
@@ -44,7 +44,7 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
                          const std::vector<Scope *> &local_scopes,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index ee45521c21..2fd1e0e7e9 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -36,7 +36,7 @@ struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -46,7 +46,7 @@ namespace details {
 
 struct FusedBroadcastOpHandle : public BroadcastOpHandle {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   FusedBroadcastOpHandle(ir::Node *node,
                          const std::vector<Scope *> local_scopes,
                          const std::vector<platform::Place> &places,
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
index d12a1cdc77..42c815f958 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new FusedBroadcastOpHandle(
           nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
 #else
@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
   test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(FusedBroadcastTester, GPULodTensor) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
index 06de2d2973..103dd0c5ae 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc
@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
           << ", place:" << tensor->place() << ", numel:" << tensor->numel();
 
   if (platform::is_gpu_place(tensor->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
                                               place);
 #else
diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
index a46b4d0e5a..55261cf7cd 100644
--- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu
+++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu
@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
   }
   __syncthreads();
 
+#ifdef PADDLE_WITH_HIP
+  if (true && hipThreadIdx_x == 0) {
+    printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
+           nan_count, inf_count, num_count);
+#else
   if (true && threadIdx.x == 0) {
     printf("In block %d, there has %u,%u,%u nan,inf,num\n", blockIdx.x,
            nan_count, inf_count, num_count);
+#endif
     PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
   }
 }
@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
                                        "op_var2gpu_str, but now failed",
                                        op_var));
 
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
+                       hipMemcpyHostToDevice, dev_ctx->stream()));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
                         cudaMemcpyHostToDevice, dev_ctx->stream()));
+#endif
   } else {  // get
     auto iter = op_var2gpu_str.find(op_var);
     PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
+                     dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
+                     print_num, gpu_str_ptr);
+#else
   CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
       tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
+#endif
 }
 
 template <>
diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h
index eb536560b6..762f4071b5 100644
--- a/paddle/fluid/framework/details/nccl_op_handle.h
+++ b/paddle/fluid/framework/details/nccl_op_handle.h
@@ -21,7 +21,12 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/nccl_helper.h"
 
 DECLARE_bool(sync_nccl_allreduce);
@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
   }
   virtual ~NCCLOpHandleBase() {
     for (auto& ev : inter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
     for (auto& ev : exter_events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
   void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
     }
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &inter_events_[dev_id], hipEventDisableTiming));
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
+        &exter_events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &inter_events_[dev_id], cudaEventDisableTiming));
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
         &exter_events_[dev_id], cudaEventDisableTiming));
+#endif
     VLOG(10) << "Create events on dev_id:" << dev_id
              << ", inter_event:" << &inter_events_[dev_id]
              << ", exter_event:" << &exter_events_[dev_id];
@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(inter_events_.at(dev_id), stream);
+#else
     cudaEventRecord(inter_events_.at(dev_id), stream);
+#endif
 
     if (FLAGS_sync_nccl_allreduce) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
     }
   }
@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", dev_id:" << dev_id << ", dtype:" << datatype
             << ", place:" << place << ", stream:" << stream;
 
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sendbuff, recvbuff, count, datatype, op, comm, stream));
+
+    hipEventRecord(exter_events_.at(dev_id), stream);
+
+    if (FLAGS_sync_nccl_allreduce) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+    }
+#else
     cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
     if (FLAGS_sync_nccl_allreduce) {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
     }
+#endif
   }
 
   void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
             << ", numel:" << count << ", dev_id:" << dev_id
             << ", dtype:" << datatype << ", place:" << place
             << ", stream:" << stream;
-
+#ifdef PADDLE_WITH_HIP
+    hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#else
     cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
+#endif
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         sendbuff, count, datatype, 0, comm, stream));
   }
@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
 
  private:
   // hierarchical needed events
-  std::unordered_map<int, cudaEvent_t> inter_events_;
-  std::unordered_map<int, cudaEvent_t> exter_events_;
+  std::unordered_map<int, gpuEvent_t> inter_events_;
+  std::unordered_map<int, gpuEvent_t> exter_events_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index b7f9315325..4b5d0563d7 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
 }
 
 OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &ev : events_) {
     if (ev.second) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
+#endif
     }
   }
 #endif
 }
 
 void OpHandleBase::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   for (auto &p : dev_ctxes_) {
     int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
     platform::SetDeviceId(dev_id);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+#endif
   }
   if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
     for (auto &out_var : outputs_) {
@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
 }
 
 void OpHandleBase::Run(DeviceType use_device) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
     InitCUDA();
   }
@@ -158,7 +167,7 @@
 }
 
 void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
                                           "Argument waited_ctx is NULL."));
   if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
     auto stream =
         static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
     for (auto &ev : events_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
+#endif
     }
   }
 #else
@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         auto stream =
             static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(
             cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
         PADDLE_THROW(
             platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
     if (in_var_handle) {
       auto &place = in_var_handle->place();
       if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         platform::DeviceContextPool &pool =
             platform::DeviceContextPool::Instance();
         auto stream =
             static_cast<platform::CUDADeviceContext *>(pool.Get(place))
                 ->stream();
+#ifdef PADDLE_WITH_HIP
+        PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
         PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 #else
         PADDLE_THROW(platform::errors::PreconditionNotMet(
             "Not compiled with CUDA."));
@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
       auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
       if (in_var_handle) {
         if (platform::is_gpu_place(in_var_handle->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
           auto stream = static_cast<platform::CUDADeviceContext *>(
                             dev_ctxes_.at(in_var_handle->place()))
                             ->stream();
+#ifdef PADDLE_WITH_HIP
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
           PADDLE_ENFORCE_CUDA_SUCCESS(
               cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#endif
 #else
           PADDLE_THROW(
               platform::errors::PreconditionNotMet("Not compiled with CUDA."));
@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
 void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
   callback();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!events_.empty()) {  // Use event
     for (auto &p : dev_ctxes_) {
       auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
       auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
       VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#else
       PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
+#endif
     }
   }
 #endif
@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
 
 void OpHandleBase::RunAndRecordEvent(platform::Place p,
                                      const std::function<void()> &callback) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (platform::is_cpu_place(p) || events_.empty()) {
     callback();
   } else {
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index 11df07e20e..93bdf92f19 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -157,8 +157,8 @@ class OpHandleBase {
   std::vector<Scope *> local_exec_scopes_;
   bool skip_running_ = false;
 
-#ifdef PADDLE_WITH_CUDA
-  std::unordered_map<int, cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::unordered_map<int, gpuEvent_t> events_;
 #endif
 
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
index c7189928d0..1d78a650f9 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
       }
     });
   } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto pre_in = pre_in_var->Get<framework::SelectedRows>();
     VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
     VariableVisitor::GetMutableTensor(out_var).mutable_data(
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 011c5ef2f1..569699c19c 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -40,7 +40,7 @@ namespace platform {
 struct NCCLContextMap;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places,
@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
 
   std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
 
-#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
+    defined PADDLE_WITH_DISTRIBUTE
   template <typename DevCtx, typename DataType>
   void GatherSelectedRows(
       const std::vector<const SelectedRows *> &src_selecte_rows_,
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
index 0ae53b35a4..82f5ea6a66 100644
--- a/paddle/fluid/framework/details/reduce_op_handle_test.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
   std::vector<p::Place> gpu_list_;
   std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
   void InitCtxOnGpu(bool use_gpu) {
     use_gpu_ = use_gpu;
     if (use_gpu) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      int count = p::GetCUDADeviceCount();
      if (count <= 1) {
        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
        gpu_list_.push_back(p);
        ctxs_.emplace_back(new p::CPUDeviceContext(p));
      }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      nccl_ctxs_.reset(nullptr);
 #endif
    }
@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
    nodes.emplace_back(new ir::Node("node"));
 
    if (use_gpu_) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else
      PADDLE_THROW(
          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
    } else {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
                                          gpu_list_, nccl_ctxs_.get()));
 #else
@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
   test_op.InitReduceOp(out_scope_idx);
   test_op.TestReduceLodTensors(out_scope_idx);
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
   TestReduceOpHandle test_op;
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
index aa32a248e7..fcfbfd0557 100644
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
           "Please recompile or reinstall Paddle with XPU support."));
 #endif
     } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       OutT cast_coeff = static_cast<OutT>(coeff_);
       auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
       memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
       local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
   tensor->Resize(make_ddim({1}));
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
                             this->dev_ctxes_.at(place_));
   this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index 0b14b33cf8..f75cd982f7 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
 }
 
 void ShareTensorBufferOpHandle::InitCUDA() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int dev_id =
       BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
   events_[dev_id] = nullptr;
-- 
GitLab
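
Illustrative sketch (not part of the patch above): the change repeats one
portability pattern throughout -- a shared gpuEvent_t/gpuStream_t alias plus
PADDLE_WITH_HIP / PADDLE_WITH_CUDA branches around otherwise-identical runtime
calls. The snippet below is a minimal, self-contained reconstruction of that
pattern. The PADDLE_WITH_* macro names and the event-sync call sequence come
from the patch itself; the standalone alias definitions and the OrderStreams
helper are hypothetical, written only to keep the example compilable on its
own (in Paddle the aliases come from the platform headers).

// Sketch of the CUDA/HIP event pattern this patch applies; assumes either
// PADDLE_WITH_HIP or PADDLE_WITH_CUDA semantics are selected at compile time.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;    // in Paddle these aliases are provided by
using gpuStream_t = hipStream_t;  // the platform headers, not defined locally
#else
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;
using gpuStream_t = cudaStream_t;
#endif

// Create a timing-disabled event, record it on the producer stream, and make
// the consumer stream wait on it -- the same sequence EagerDeletionOpHandle
// uses to order garbage collection after compute. Error checking (which the
// real code wraps in PADDLE_ENFORCE_CUDA_SUCCESS) is elided here; destroying
// the event immediately is safe because both runtimes defer the release until
// enqueued work that references it has completed.
inline void OrderStreams(gpuStream_t producer, gpuStream_t consumer) {
  gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
  hipEventCreateWithFlags(&event, hipEventDisableTiming);
  hipEventRecord(event, producer);
  hipStreamWaitEvent(consumer, event, 0);
  hipEventDestroy(event);
#else
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  cudaEventRecord(event, producer);
  cudaStreamWaitEvent(consumer, event, 0);
  cudaEventDestroy(event);
#endif
}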