From a60d93fb77a055540fe239d97055975ba7dc8e2f Mon Sep 17 00:00:00 2001
From: Qi Li <qili93@qq.com>
Date: Tue, 23 Feb 2021 10:09:29 +0800
Subject: [PATCH] [ROCM] update fluid framework for rocm (part2), test=develop
 (#31010)

---
 paddle/fluid/framework/fleet/CMakeLists.txt   | 13 ++++-
 paddle/fluid/framework/fleet/box_wrapper.cc   |  2 +-
 paddle/fluid/framework/fleet/box_wrapper.cu   | 48 +++++++++++++++++++
 paddle/fluid/framework/fleet/box_wrapper.h    |  8 +++-
 .../fluid/framework/fleet/box_wrapper_impl.h  | 12 +++--
 paddle/fluid/framework/fleet/fleet_wrapper.cc | 10 +++-
 paddle/fluid/framework/fleet/fleet_wrapper.h  |  6 +--
 paddle/fluid/framework/fleet/heter_context.h  |  3 +-
 .../framework/fleet/heter_ps/CMakeLists.txt   | 16 ++++---
 .../framework/fleet/heter_ps/hashtable.h      |  6 +--
 .../framework/fleet/heter_ps/hashtable_inl.h  |  6 +--
 .../framework/fleet/heter_ps/heter_comm_inl.h |  4 +-
 .../framework/fleet/heter_ps/heter_resource.h | 18 +++----
 paddle/fluid/framework/fleet/heter_wrapper.cc |  8 ++--
 paddle/fluid/framework/fleet/heter_wrapper.h  |  4 +-
 paddle/fluid/framework/fleet/nccl_wrapper.cc  | 18 +++++--
 paddle/fluid/framework/fleet/nccl_wrapper.h   |  9 ++--
 .../fluid/framework/fleet/ps_gpu_wrapper.cc   |  3 +-
 paddle/fluid/framework/fleet/ps_gpu_wrapper.h |  3 +-
 19 files changed, 144 insertions(+), 53 deletions(-)
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 4d0cfb62976..61f3c026f1f 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -4,6 +4,10 @@ if(WITH_PSLIB)
         nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
         DEPS heter_ps)
         add_subdirectory(heter_ps)
+    elseif(WITH_RCCL)
+        hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps)
+        add_subdirectory(heter_ps)
     else()
         cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
     endif(WITH_NCCL)
@@ -12,11 +16,16 @@ else()
     cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
 endif(WITH_PSLIB)
 
-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
     cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
 endif()
 if(WITH_BOX_PS)
-    nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+    if(WITH_GPU)
+        nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+    endif()
+    if(WITH_ROCM)
+        hip_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+    endif()
 else()
     cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
 endif(WITH_BOX_PS)
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
index 2d3e6943822..37fbf47f854 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ b/paddle/fluid/framework/fleet/box_wrapper.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 
 std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
-cudaStream_t BoxWrapper::stream_list_[8];
+gpuStream_t BoxWrapper::stream_list_[8];
 std::shared_ptr<boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
 AfsManager* BoxWrapper::afs_manager = nullptr;
 int BoxWrapper::embedx_dim_ = 8;
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
index 31809532a69..c9b5abf7a9b 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -142,8 +142,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                     ->stream();
   auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
              cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
     constexpr size_t EmbedxDim = i;                                          \
@@ -155,6 +160,19 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
     }                                                                        \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                        \
+  case i: {                                                                   \
+    constexpr size_t ExpandDim = i;                                           \
+    hipLaunchKernelGGL(                                                       \
+        PushCopy<EmbedxDim, ExpandDim>, dim3((total_length + 512 - 1) / 512), \
+        dim3(512), 0, stream, gpu_values,                                     \
+        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(      \
+            total_values_gpu),                                                \
+        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,       \
+        gpu_keys);                                                            \
+  } break
+#else
 #define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -166,6 +184,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
         gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
         gpu_keys);                                                           \
   } break
+#endif
 
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8);
@@ -187,9 +206,16 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
                     platform::DeviceContextPool::Instance().Get(
                         BOOST_GET_CONST(platform::CUDAPlace, place)))
                     ->stream();
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
+                     dim3(512), 0, stream, origin_keys, total_keys, gpu_len,
+                     slot_num, total_len);
+  hipStreamSynchronize(stream);
+#else
   CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>(
       origin_keys, total_keys, gpu_len, slot_num, total_len);
   cudaStreamSynchronize(stream);
+#endif
 }
 
 void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
@@ -217,12 +243,21 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
   int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
   int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
 
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+  hipMemcpy(gpu_len, slot_lengths_lod.data(),
+            slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+  hipMemcpy(d_slot_vector, slot_vector_.data(),
+            slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, grad_values.data(),
              grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_len, slot_lengths_lod.data(),
              slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
   cudaMemcpy(d_slot_vector, slot_vector_.data(),
              slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
+#endif
 
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
@@ -235,6 +270,18 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
     }                                                                        \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(PushCopy<EmbedxDim, ExpandDim>,                       \
+        dim3(total_length + 512 - 1) / 512), dim3(512), 0, stream,           \
+        reinterpret_cast<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>*>( \
+            total_grad_values_gpu),                                          \
+        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
+        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
+  } break
+#else
 #define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -245,6 +292,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
         gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
         slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
   } break
+#endif
 
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8);
diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
index 399ee744ea9..645d725871a 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ b/paddle/fluid/framework/fleet/box_wrapper.h
@@ -396,7 +396,7 @@ class BoxWrapper {
       const std::string& model_path) {
     if (nullptr != s_instance_) {
       VLOG(3) << "Begin InitializeGPU";
-      std::vector<cudaStream_t*> stream_list;
+      std::vector<gpuStream_t*> stream_list;
       for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) {
         VLOG(3) << "before get context i[" << i << "]";
         platform::CUDADeviceContext* context =
@@ -542,8 +542,12 @@ class BoxWrapper {
       auto* gpu_data = gpu_tensor.data<T>();
       auto len = gpu_tensor.numel();
       data->resize(len);
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost);
+#else
       cudaMemcpy(data->data(), gpu_data, sizeof(T) * len,
                  cudaMemcpyDeviceToHost);
+#endif
     }
     static inline std::pair<int, int> parse_cmatch_rank(uint64_t x) {
       // first 32 bit store cmatch and second 32 bit store rank
@@ -819,7 +823,7 @@ class BoxWrapper {
   }
 
  private:
-  static cudaStream_t stream_list_[8];
+  static gpuStream_t stream_list_[8];
   static std::shared_ptr<boxps::BoxPSBase> boxps_ptr_;
   boxps::PSAgentBase* p_agent_ = nullptr;
   // TODO(hutuxian): magic number, will add a config to specify
diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
index b4e414dc83e..8832f0a20e3 100644
--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -43,7 +43,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
     PADDLE_THROW(platform::errors::Unimplemented(
         "Warning:: CPUPlace is not supported in PaddleBox now."));
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
     VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
     int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
     LoDTensor& total_keys_tensor = keys_tensor[device_id];
@@ -60,11 +60,17 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
         memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
     uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
     int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
+#ifdef PADDLE_WITH_HIP
+    hipMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
+              hipMemcpyHostToDevice);
+    hipMemcpy(gpu_len, slot_lengths_lod.data(),
+              slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+#else
     cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
                cudaMemcpyHostToDevice);
     cudaMemcpy(gpu_len, slot_lengths_lod.data(),
                slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
-
+#endif
     this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                    static_cast<int>(slot_lengths.size()),
                    static_cast<int>(total_length));
@@ -124,7 +130,7 @@ void BoxWrapper::PushSparseGradCase(
     PADDLE_THROW(platform::errors::Unimplemented(
         "Warning:: CPUPlace is not supported in PaddleBox now."));
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
     int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
     LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
     uint64_t* total_keys =
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 425c8a9f2a7..7ad20aa6e18 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -698,13 +698,14 @@ void FleetWrapper::PushDenseVarsSync(
     Scope* scope, const uint64_t table_id,
     const std::vector<std::string>& var_names) {}
 
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined PADDLE_WITH_PSLIB)
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
     const std::vector<std::string>& var_names,
     std::vector<::std::future<int32_t>>* push_sparse_status,
     float scale_datanorm, int batch_size, const paddle::platform::Place& place,
-    cudaStream_t stream, cudaEvent_t event) {
+    gpuStream_t stream, gpuEvent_t event) {
   std::vector<paddle::ps::Region> regions;
   for (auto& t : var_names) {
     Variable* var = scope.FindVar(t);
@@ -719,8 +720,13 @@ void FleetWrapper::PushDenseVarsAsync(
     memory::Copy(platform::CUDAPinnedPlace(), pin_g,
                  BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
                  sizeof(float) * count, stream);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+    hipEventSynchronize(event);
+#else
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
     cudaEventSynchronize(event);
+#endif
 
     float* g = pin_g;
     if (scale_datanorm >= 0) {
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index aa0da828626..e584fb5e2b9 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -152,14 +152,14 @@ class FleetWrapper {
 // Push dense variables to server in async mode
 // Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
 // Param<out>: push_sparse_status
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
       std::vector<::std::future<int32_t>>* push_sparse_status,
       float scale_datanorm, int batch_size,
-      const paddle::platform::Place& place, cudaStream_t stream,
-      cudaEvent_t event);
+      const paddle::platform::Place& place, gpuStream_t stream,
+      gpuEvent_t event);
 #endif
 #ifdef PADDLE_WITH_XPU
   void PushDenseVarsAsync(
diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h
index 2ea3c10fd87..fc987b523d5 100644
--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 
 #include <algorithm>
 #include <map>
diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index 2eed13c530d..6df2cd52bb4 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -1,6 +1,10 @@
-nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc
-heter_resource.h hashtable.h DEPS cub device_context)
-nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS
-heter_comm)
-
-nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+IF(WITH_GPU)
+    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
+    nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
+IF(WITH_ROCM)
+    hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    hip_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
+    hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
index 11bd6e7aa69..2aa00e84e15 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -45,15 +45,15 @@ class HashTable {
   HashTable(const HashTable&) = delete;
   HashTable& operator=(const HashTable&) = delete;
   void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
-              cudaStream_t stream);
+              gpuStream_t stream);
   void get(const KeyType* d_keys, ValType* d_vals, size_t len,
-           cudaStream_t stream);
+           gpuStream_t stream);
   void show();
   void dump_to_cpu(int devid, cudaStream_t stream);
 
   template <typename GradType, typename Sgd>
   void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
-              Sgd sgd, cudaStream_t stream);
+              Sgd sgd, gpuStream_t stream);
 
  private:
   TableContainer<KeyType, ValType>* container_;
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
index ef37ed64c2a..871f9c7857a 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -87,7 +87,7 @@ void HashTable<KeyType, ValType>::show() {
 
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
-                                      size_t len, cudaStream_t stream) {
+                                      size_t len, gpuStream_t stream) {
   if (len == 0) {
     return;
   }
@@ -99,7 +99,7 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
                                          const ValType* d_vals, size_t len,
-                                         cudaStream_t stream) {
+                                         gpuStream_t stream) {
   if (len == 0) {
     return;
   }
@@ -147,7 +147,7 @@ template <typename KeyType, typename ValType>
 template <typename GradType, typename Sgd>
 void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
                                          const GradType* d_grads, size_t len,
-                                         Sgd sgd, cudaStream_t stream) {
+                                         Sgd sgd, gpuStream_t stream) {
   if (len == 0) {
     return;
   }
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
index f95d4d3948b..e42a3a324f1 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -25,7 +25,7 @@ __global__ void fill_idx(T* idx, size_t len) {
 }
 
 template <typename T>
-void show_tensor(T* input, size_t len, cudaStream_t stream, std::string name) {
+void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) {
   T tmp[len];
   cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream);
   cudaStreamSynchronize(stream);
@@ -270,7 +270,7 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
   std::vector<std::shared_ptr<memory::Allocation>> d_key_bufs;
   std::vector<std::shared_ptr<memory::Allocation>> d_val_bufs;
 
-  cudaStream_t streams[stream_num];
+  gpuStream_t streams[stream_num];
   for (int i = 0; i < stream_num; ++i) {
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i])));
     auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType));
diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
index 938164dd194..ad7649a8a33 100644
--- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
@@ -34,16 +34,16 @@ class GPUResource {
 
   int dev_id() const { return dev_id_; }
   int index() const { return index_; }
-  cudaStream_t local_stream(int num) { return local_streams_[num]; }
-  cudaStream_t remote_stream() { return remote_stream_; }
-  cudaStream_t comm_stream(int num) { return comm_streams_[num]; }
+  gpuStream_t local_stream(int num) { return local_streams_[num]; }
+  gpuStream_t remote_stream() { return remote_stream_; }
+  gpuStream_t comm_stream(int num) { return comm_streams_[num]; }
 
   int dev_id_;
   int index_;
   std::vector<int> dev_ids_;
-  cudaStream_t remote_stream_;
-  std::vector<cudaStream_t> local_streams_;
-  std::vector<cudaStream_t> comm_streams_;
+  gpuStream_t remote_stream_;
+  std::vector<gpuStream_t> local_streams_;
+  std::vector<gpuStream_t> comm_streams_;
 };
 
 class HeterPsResource {
@@ -56,9 +56,9 @@ class HeterPsResource {
   int total_gpu();
   int get_index_by_devid(int devid);
   int dev_id(int num);
-  cudaStream_t local_stream(int gpu_num, int stream_num);
-  cudaStream_t remote_stream(int gpu_num);
-  cudaStream_t comm_stream(int gpu_num, int stream_num);
+  gpuStream_t local_stream(int gpu_num, int stream_num);
+  gpuStream_t remote_stream(int gpu_num);
+  gpuStream_t comm_stream(int gpu_num, int stream_num);
 
   std::vector<std::shared_ptr<GPUResource>> resources_;
   std::vector<int> dev_ids_;
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc
index 8e232560ab6..a0667e9adbb 100644
--- a/paddle/fluid/framework/fleet/heter_wrapper.cc
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -114,7 +114,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
     memcpy(data_ptr, tensor->data<void>(),
            tensor->numel() * SizeOfType(tensor->type()));
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     memory::Copy(platform::CPUPlace(), data_ptr,
                  BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                  tensor->data<void>(),
@@ -129,11 +129,11 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
   }
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                        const VariableMessage& req_var,
                                        platform::Place place,
-                                       cudaStream_t stream) {
+                                       gpuStream_t stream) {
   // const VariableMessage& req_var = request->vars();
   auto* var = scope->FindVar(req_var.varname());
   auto* tensor = var->GetMutable<LoDTensor>();
@@ -157,7 +157,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
   void* tensor_data =
       tensor->mutable_data(place, ToVarType(req_var.data_type()));
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
                platform::CPUPlace(), req_var.data().data(),
                tensor->numel() * SizeOfType(tensor->type()), stream);
diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h
index 55ad218198e..871d2e251b4 100644
--- a/paddle/fluid/framework/fleet/heter_wrapper.h
+++ b/paddle/fluid/framework/fleet/heter_wrapper.h
@@ -86,9 +86,9 @@ class HeterWrapper {
 
   framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
-                           platform::Place place, cudaStream_t stream);
+                           platform::Place place, gpuStream_t stream);
 #endif
   void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
                            platform::Place place);
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index 8ba94f4fd7a..3ac95632de6 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -21,7 +21,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
 bool NCCLWrapper::is_initialized_ = false;
 
 void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
@@ -30,14 +30,14 @@ void NCCLWrapper::InitNCCL() {
 }
 
 void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   nccl_info_.nccl_id_ = nccl_info.nccl_id_;
 #endif
   return;
 }
 
 NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   PADDLE_ENFORCE_CUDA_SUCCESS(
       platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
@@ -46,19 +46,23 @@ NCCLInfo NCCLWrapper::GetNCCLId() {
 
 void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
                               const int ranks) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
   platform::SetDeviceId(local_rank);
+#ifdef PADDLE_WITH_RCCL
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_)));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
+#endif
 #endif
   return;
 }
 
 void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
                           const std::vector<std::string>& var_names) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   for (auto& name : var_names) {
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
@@ -66,7 +70,11 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
         root_rank, nccl_info_.comm_, nccl_info_.stream_));
+#ifdef PADDLE_WITH_RCCL
+    hipStreamSynchronize(nccl_info_.stream_);
+#else
     cudaStreamSynchronize(nccl_info_.stream_);
+#endif
   }
 #endif
   return;
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
index 3725a225dbe..e12bfd8b27d 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -25,9 +25,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_NCCL)
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 
 namespace paddle {
@@ -48,10 +51,10 @@ class NCCLInfo {
   int local_rank_;
   int global_ranks_;
   int my_global_rank_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   ncclUniqueId nccl_id_;
   ncclComm_t comm_;
-  cudaStream_t stream_;
+  gpuStream_t stream_;
 #endif
 };
 
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
index 32eb9418b65..728188e7022 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -26,7 +26,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 
 #include <algorithm>
 #include <deque>
diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
index 98e0028e427..8a536fe0b82 100644
--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 
 #include <atomic>
 #include <ctime>
-- 
GitLab