Unverified commit 50967135, authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part3), test=develop (#31011)

Parent cf43a321
......@@ -45,7 +45,24 @@ if(WITH_GPU)
endif()
nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
elseif(WITH_ROCM)
hip_library(nan_inf_utils SRCS nan_inf_utils_detail.cc nan_inf_utils_detail.cu DEPS framework_proto scope place)
hip_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor)
hip_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda variable_visitor place device_memory_aligment)
hip_library(grad_merge_all_reduce_op_handle SRCS grad_merge_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor
ddim memory dynload_cuda variable_visitor place device_memory_aligment all_reduce_op_handle fused_all_reduce_op_handle)
if(WITH_DISTRIBUTE)
hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
else()
hip_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
ddim dynload_cuda selected_rows_functor)
endif()
hip_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
hip_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
else()
cc_library(nan_inf_utils SRCS nan_inf_utils_detail.cc DEPS framework_proto scope place)
cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
......@@ -118,7 +135,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass
fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass
sync_batch_norm_pass runtime_context_cache_pass)
if(NOT APPLE AND NOT WIN32 AND WITH_GPU)
if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM))
set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass)
endif()
cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS})
......
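Note: the hip_library targets added above are compiled for ROCm, and the WITH_ROCM build defines the source-level macros the rest of this patch keys on (PADDLE_WITH_HIP, plus PADDLE_WITH_RCCL when the RCCL collective library is enabled). A minimal sketch of the resulting guard idiom, illustrative only and not code from this commit:

#if defined(PADDLE_WITH_HIP)
// ROCm build: HIP runtime, RCCL for collectives
#elif defined(PADDLE_WITH_CUDA)
// CUDA build: CUDA runtime, NCCL for collectives
#else
// CPU-only build
#endif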
......@@ -16,7 +16,7 @@
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -40,15 +40,20 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
place_(place),
var_infos_(vars.begin(), vars.end()),
gc_(gc) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
#endif
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The cuda envet created is NULL."));
}
......@@ -64,17 +69,21 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
}
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
#endif
}
#endif
}
void EagerDeletionOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
events_[dev_id] = nullptr;
......@@ -141,21 +150,27 @@ void EagerDeletionOpHandle::RunImpl() {
void EagerDeletionOpHandle::ClearGarbages(
std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (event_) {
auto compute_stream = dev_ctx_->stream();
auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(callback_stream, event_, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
#endif
};
gc_->Add(std::move(*garbages), callback_func);
} else {
#endif
gc_->Add(std::move(*garbages));
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
}
#endif
}
......
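Note on the ClearGarbages change above: both branches implement the same cross-stream ordering: an event is recorded on the compute stream and the garbage collector's callback stream waits on it, so memory is not released while the compute stream may still be using it. A self-contained sketch of that record-then-wait pattern (hedged: simplified, not the EagerDeletionOpHandle code itself; error checking omitted):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
// Record pending work on the compute stream, then make the callback stream
// wait for it before any frees enqueued there can run.
void FenceFreeAfterCompute(hipEvent_t event, hipStream_t compute_stream,
                           hipStream_t callback_stream) {
  hipEventRecord(event, compute_stream);
  hipStreamWaitEvent(callback_stream, event, 0);
}
#else
#include <cuda_runtime.h>
void FenceFreeAfterCompute(cudaEvent_t event, cudaStream_t compute_stream,
                           cudaStream_t callback_stream) {
  cudaEventRecord(event, compute_stream);
  cudaStreamWaitEvent(callback_stream, event, 0);
}
#endif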
......@@ -82,9 +82,9 @@ class EagerDeletionOpHandle : public OpHandleBase {
std::vector<ir::MemOptVarInfo *> var_infos_; // not own
GarbageCollector *gc_; // not own
std::vector<Variable *> vars_;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDADeviceContext *dev_ctx_{nullptr};
cudaEvent_t event_{nullptr};
gpuEvent_t event_{nullptr};
#endif
};
......
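Note: gpuEvent_t in the header above is the runtime-neutral event type, cudaEvent_t under CUDA and hipEvent_t under ROCm. Paddle provides this alias in its platform headers; the exact location is not shown in this diff, so the following is only an illustration of what such an alias looks like:

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;     // ROCm build
using gpuStream_t = hipStream_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;    // CUDA build
using gpuStream_t = cudaStream_t;
#endif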
......@@ -122,7 +122,7 @@ static void TransData(const framework::Tensor *src_item,
const platform::DeviceContext &ctx) {
if (src_item->IsInitialized() && src_item->numel() > 0) {
if (platform::is_gpu_place(src_item->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item);
#endif
} else {
......
......@@ -114,7 +114,7 @@ static void TransData(const framework::LoDTensor &src_item,
framework::LoDTensor *dst_item) {
if (src_item.IsInitialized() && src_item.numel() > 0) {
if (platform::is_gpu_place(src_item.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TensorCopy(src_item, platform::CPUPlace(), dst_item);
#endif
} else {
......
......@@ -26,7 +26,7 @@ namespace details {
typedef std::vector<std::vector<std::pair<std::string, const LoDTensor *>>>
GradientAndLoDTensor;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
FusedAllReduceOpHandle::FusedAllReduceOpHandle(
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
......
......@@ -33,7 +33,7 @@ namespace platform {
class NCCLCommunicator;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/details/nccl_op_handle.h"
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
......@@ -44,7 +44,7 @@ namespace paddle {
namespace framework {
namespace details {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
struct FusedAllReduceOpHandle : public AllReduceOpHandle {
FusedAllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
......
......@@ -36,7 +36,7 @@ struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -46,7 +46,7 @@ namespace details {
struct FusedBroadcastOpHandle : public BroadcastOpHandle {
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
FusedBroadcastOpHandle(ir::Node *node,
const std::vector<Scope *> local_scopes,
const std::vector<platform::Place> &places,
......
......@@ -57,7 +57,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else
......@@ -166,7 +166,8 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST(FusedBroadcastTester, GPULodTensor) {
TestFusedBroadcastOpHandle test_op;
std::vector<size_t> input_scope_idxes = {0, 1};
......
......@@ -318,7 +318,7 @@ void CheckVarHasNanOrInf(const std::string& op_type,
<< ", place:" << tensor->place() << ", numel:" << tensor->numel();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
tensor_check<platform::CUDADeviceContext>(op_type, var_name, *tensor,
place);
#else
......
......@@ -82,9 +82,15 @@ __device__ __forceinline__ void PrintNanInfKernel(const T* value,
}
__syncthreads();
#ifdef PADDLE_WITH_HIP
if (true && hipThreadIdx_x == 0) {
printf("In block %d, there has %u,%u,%u nan,inf,num\n", hipBlockIdx_x,
nan_count, inf_count, num_count);
#else
if (true && threadIdx.x == 0) {
printf("In block %d, there has %u,%u,%u nan,inf,num\n", blockIdx.x,
nan_count, inf_count, num_count);
#endif
PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
}
}
......@@ -150,9 +156,15 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
"op_var2gpu_str, but now failed",
op_var));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
hipMemcpyHostToDevice, dev_ctx->stream()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1,
cudaMemcpyHostToDevice, dev_ctx->stream()));
#endif
} else { // get
auto iter = op_var2gpu_str.find(op_var);
PADDLE_ENFORCE_EQ(iter != op_var2gpu_str.end(), true,
......@@ -168,8 +180,14 @@ void TensorCheckerVisitor<platform::CUDADeviceContext>::apply(
size_t blocks =
std::min(static_cast<size_t>(128),
static_cast<size_t>((tensor_.numel() + threads - 1) / threads));
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(CheckNanInfKernel, dim3(blocks), dim3(threads), 0,
dev_ctx->stream(), tensor_.data<T>(), tensor_.numel(),
print_num, gpu_str_ptr);
#else
CheckNanInfKernel<<<blocks, threads, 0, dev_ctx->stream()>>>(
tensor_.data<T>(), tensor_.numel(), print_num, gpu_str_ptr);
#endif
}
template <>
......
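Note on the kernel-launch change above: on ROCm the CUDA triple-chevron launch is replaced by hipLaunchKernelGGL(kernel, grid, block, shared_mem_bytes, stream, args...), and host-to-device copies keep the same argument order via hipMemcpyAsync/hipMemcpyHostToDevice. A small, self-contained sketch of the launch idiom (illustrative, not the CheckNanInfKernel itself; error checking omitted):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// A trivial kernel; threadIdx/blockIdx are spelled the same under nvcc and hipcc.
__global__ void FillKernel(float* data, int n, float value) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] = value;
}

void LaunchFill(float* data, int n, float value, gpuStream_t stream) {
  dim3 grid((n + 255) / 256), block(256);
#if defined(PADDLE_WITH_HIP)
  hipLaunchKernelGGL(FillKernel, grid, block, 0, stream, data, n, value);
#else
  FillKernel<<<grid, block, 0, stream>>>(data, n, value);
#endif
}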
......@@ -21,7 +21,12 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/nccl_helper.h"
DECLARE_bool(sync_nccl_allreduce);
......@@ -46,10 +51,18 @@ class NCCLOpHandleBase : public OpHandleBase {
}
virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
for (auto& ev : exter_events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
}
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
......@@ -95,10 +108,17 @@ class NCCLOpHandleBase : public OpHandleBase {
}
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
&inter_events_[dev_id], hipEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventCreateWithFlags(
&exter_events_[dev_id], hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
&exter_events_[dev_id], cudaEventDisableTiming));
#endif
VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id];
......@@ -175,10 +195,18 @@ class NCCLOpHandleBase : public OpHandleBase {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
#ifdef PADDLE_WITH_HIP
hipEventRecord(inter_events_.at(dev_id), stream);
#else
cudaEventRecord(inter_events_.at(dev_id), stream);
#endif
if (FLAGS_sync_nccl_allreduce) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
}
}
......@@ -199,6 +227,18 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place << ", stream:" << stream;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream));
hipEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
}
#else
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
......@@ -209,6 +249,7 @@ class NCCLOpHandleBase : public OpHandleBase {
if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}
#endif
}
void InterBroadCast(platform::Place place, void* sendbuff, size_t count,
......@@ -223,8 +264,11 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", numel:" << count << ", dev_id:" << dev_id
<< ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream;
#ifdef PADDLE_WITH_HIP
hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
#else
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
#endif
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
sendbuff, count, datatype, 0, comm, stream));
}
......@@ -241,8 +285,8 @@ class NCCLOpHandleBase : public OpHandleBase {
private:
// hierarchical needed events
std::unordered_map<int, cudaEvent_t> inter_events_;
std::unordered_map<int, cudaEvent_t> exter_events_;
std::unordered_map<int, gpuEvent_t> inter_events_;
std::unordered_map<int, gpuEvent_t> exter_events_;
};
} // namespace details
......
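Note: the RCCL branch above relies on RCCL exposing the same C API and symbol names as NCCL (ncclAllReduce, ncclReduce, ncclBcast, ncclSum, ...), so the collective calls themselves are unchanged; only the dynload header and the HIP event/stream calls differ. A hedged, self-contained sketch of a collective followed by an event record (assumes an already initialized communicator and stream; the RCCL include path can vary by ROCm install):

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#include <rccl/rccl.h>              // NCCL-compatible API on ROCm
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
#include <nccl.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif

// The collective call is spelled identically on both stacks; only the
// event-record call changes.
void AllReduceAndMark(const void* sendbuff, void* recvbuff, size_t count,
                      ncclDataType_t dtype, ncclComm_t comm,
                      gpuStream_t stream, gpuEvent_t done) {
  ncclAllReduce(sendbuff, recvbuff, count, dtype, ncclSum, comm, stream);
#if defined(PADDLE_WITH_HIP)
  hipEventRecord(done, stream);
#else
  cudaEventRecord(done, stream);
#endif
}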
......@@ -31,22 +31,31 @@ std::string OpHandleBase::DebugString() const {
}
OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &ev : events_) {
if (ev.second) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(ev.second));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
#endif
}
}
#endif
}
void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
platform::SetDeviceId(dev_id);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
#endif
}
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
for (auto &out_var : outputs_) {
......@@ -124,7 +133,7 @@ void OpHandleBase::InitXPU() {
}
void OpHandleBase::Run(DeviceType use_device) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
InitCUDA();
}
......@@ -158,7 +167,7 @@ void OpHandleBase::Run(DeviceType use_device) {
}
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
......@@ -172,7 +181,11 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
#endif
}
}
#else
......@@ -203,12 +216,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#endif
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
......@@ -226,13 +244,17 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) {
if (in_var_handle) {
auto &place = in_var_handle->place();
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto stream =
static_cast<platform::CUDADeviceContext *>(pool.Get(place))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not compiled with CUDA."));
......@@ -252,12 +274,17 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
if (in_var_handle) {
if (platform::is_gpu_place(in_var_handle->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place()))
->stream();
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#endif
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
......@@ -285,14 +312,19 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
callback();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!events_.empty()) { // Use event
for (auto &p : dev_ctxes_) {
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
auto *cuda_dev_ctx = static_cast<platform::CUDADeviceContext *>(p.second);
VLOG(10) << "cudadevicecontext:" << cuda_dev_ctx << ", dev_id:" << dev_id;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream()));
#endif
}
}
#endif
......@@ -300,7 +332,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
void OpHandleBase::RunAndRecordEvent(platform::Place p,
const std::function<void()> &callback) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_cpu_place(p) || events_.empty()) {
callback();
} else {
......
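Note: the OpHandleBase changes above use two different waits, and the distinction is the same on both runtimes: hipStreamWaitEvent/cudaStreamWaitEvent only enqueue a device-side dependency (the host returns immediately), while hipStreamSynchronize/cudaStreamSynchronize block the calling host thread until the stream drains, which is what the wait_for_feed path uses. A minimal hedged illustration:

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
// Non-blocking: later work on `stream` waits for `ev`; the host moves on.
inline void DeviceSideWait(hipStream_t stream, hipEvent_t ev) {
  hipStreamWaitEvent(stream, ev, 0);
}
// Blocking: the host thread waits until everything queued on `stream` is done.
inline void HostSideWait(hipStream_t stream) { hipStreamSynchronize(stream); }
#else
#include <cuda_runtime.h>
inline void DeviceSideWait(cudaStream_t stream, cudaEvent_t ev) {
  cudaStreamWaitEvent(stream, ev, 0);
}
inline void HostSideWait(cudaStream_t stream) { cudaStreamSynchronize(stream); }
#endif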
......@@ -157,8 +157,8 @@ class OpHandleBase {
std::vector<Scope *> local_exec_scopes_;
bool skip_running_ = false;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::unordered_map<int, gpuEvent_t> events_;
#endif
DISABLE_COPY_AND_ASSIGN(OpHandleBase);
......
......@@ -165,7 +165,7 @@ void ReduceOpHandle::RunImpl() {
}
});
} else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto pre_in = pre_in_var->Get<framework::LoDTensor>();
VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
VariableVisitor::GetMutableTensor(out_var).mutable_data(
......
......@@ -40,7 +40,7 @@ namespace platform {
struct NCCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
......@@ -80,7 +80,7 @@ struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
const platform::NCCLContextMap *nccl_ctxs_;
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
......@@ -127,7 +127,8 @@ struct ReduceOpHandle : public OpHandleBase {
std::vector<Scope *> GetLocalScopes() override { return local_scopes_; }
#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \
defined PADDLE_WITH_DISTRIBUTE
template <typename DevCtx, typename DataType>
void GatherSelectedRows(
const std::vector<const SelectedRows *> &src_selecte_rows_,
......
......@@ -40,7 +40,7 @@ struct TestReduceOpHandle {
std::vector<p::Place> gpu_list_;
std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
......@@ -48,7 +48,7 @@ struct TestReduceOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
......@@ -58,7 +58,7 @@ struct TestReduceOpHandle {
void InitCtxOnGpu(bool use_gpu) {
use_gpu_ = use_gpu;
if (use_gpu) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
......@@ -83,7 +83,7 @@ struct TestReduceOpHandle {
gpu_list_.push_back(p);
ctxs_.emplace_back(new p::CPUDeviceContext(p));
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_ctxs_.reset(nullptr);
#endif
}
......@@ -104,7 +104,7 @@ struct TestReduceOpHandle {
nodes.emplace_back(new ir::Node("node"));
if (use_gpu_) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
......@@ -112,7 +112,7 @@ struct TestReduceOpHandle {
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif
} else {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
......@@ -296,7 +296,7 @@ TEST(ReduceTester, TestCPUReduceTestLodTensor) {
test_op.InitReduceOp(out_scope_idx);
test_op.TestReduceLodTensors(out_scope_idx);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
TestReduceOpHandle test_op;
......
......@@ -70,7 +70,7 @@ struct ScaleLossGradFunctor {
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
OutT cast_coeff = static_cast<OutT>(coeff_);
auto stream = static_cast<platform::CUDADeviceContext *>(ctx_)->stream();
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), out_data,
......@@ -95,7 +95,7 @@ void ScaleLossGradOpHandle::RunImpl() {
local_exec_scopes_[0]->FindVar(var_name)->GetMutable<LoDTensor>();
tensor->Resize(make_ddim({1}));
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_,
this->dev_ctxes_.at(place_));
this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); });
......
......@@ -84,7 +84,7 @@ void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
}
void ShareTensorBufferOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int dev_id =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctxes_.begin()->first).device;
events_[dev_id] = nullptr;
......