diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 0cfc82709637f2a1b2f4349ed2a543a3b082309f..21da7a0560ee39ca71540a738424df3e3581f34f 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -14,7 +14,7 @@ if(WITH_DISTRIBUTE)
     DEPS phi_api eager_api gloo_wrapper)
 endif()
 
-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
     SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc
diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h
index f38ce8faa7ffb53c8e80855bdd4d6c946767748e..5f1da003313ad36f76282e5b6c48b203c59aa81a 100644
--- a/paddle/fluid/distributed/collective/NCCLTools.h
+++ b/paddle/fluid/distributed/collective/NCCLTools.h
@@ -14,7 +14,13 @@
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 
 #include <error.h>
 #include <string>
@@ -23,9 +29,19 @@
 #include "paddle/fluid/distributed/collective/Types.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/variable.h"
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#endif
+
 #include "paddle/fluid/platform/device_context.h"
+
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -56,7 +72,11 @@ class EventManager {
   ~EventManager() {
     if (is_created_) {
       platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_HIP
+      hipEventDestroy(event_);
+#else
       cudaEventDestroy(event_);
+#endif
     }
   }
 
@@ -94,24 +114,42 @@ class EventManager {
                           "Event's device %d",
                           device_index, device_index_));
     platform::CUDADeviceGuard guard(device_index_);
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream()));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, ctx.stream()));
+#endif
   }
 
   bool Query() const {
+#ifdef PADDLE_WITH_HIP
+    gpuError_t err = hipEventQuery(event_);
+    if (err == hipSuccess) {
+      return true;
+    }
+    if (err == hipErrorNotReady) {
+      return false;
+    }
+#else
     gpuError_t err = cudaEventQuery(event_);
     if (err == cudaSuccess) {
       return true;
-    } else if (err == cudaErrorNotReady) {
-      return false;
-    } else {
-      PADDLE_ENFORCE_GPU_SUCCESS(err);
+    }
+    if (err == cudaErrorNotReady) {
       return false;
     }
+#endif
+    PADDLE_ENFORCE_GPU_SUCCESS(err);
+    return false;
   }
 
   void Synchronize() const {
     if (is_created_) {
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_));
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_));
+#endif
     }
   }
 
@@ -124,12 +162,22 @@ class EventManager {
                           "Event's device %d",
                           device_index, device_index_));
       platform::CUDADeviceGuard guard(device_index_);
+
+#ifdef PADDLE_WITH_HIP
+      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(ctx.stream(), event_, 0));
+#else
       PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0));
+#endif
     }
   }
 
  private:
+#ifdef PADDLE_WITH_HIP
+  unsigned int flags_ = hipEventDefault;
+#else
   unsigned int flags_ = cudaEventDefault;
+#endif
+
   bool is_created_{false};
   gpuEvent_t event_{};
   int8_t device_index_{0};
@@ -138,7 +186,13 @@ class EventManager {
   void CreateEvent(int device_index) {
     device_index_ = device_index;
     platform::CUDADeviceGuard guard(device_index);
+
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags(&event_, flags_));
+#else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_));
+#endif
+
     is_created_ = true;
   }
 };
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index dc67205c78f56d8d7012a42dd7383413d73d0692..793f8dacbf8d4a79e138c7b85ffe0453e94af750 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -95,7 +95,11 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
     // If we use the work to do barrier, we should block cpu
     for (auto& place : places_) {
       platform::CUDADeviceGuard gpuGuard(place);
+#ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
+#else
+      PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
+#endif
     }
   }
   return true;
 }
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index 2325e645b4c46e184b68dfe1fd6341622cd3ac31..c56f75b46518c26e33b1d3066c59eb5c527fc544 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -30,8 +30,13 @@
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/distributed/collective/NCCLTools.h"
+#endif
+
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 