[ROCM] update fluid framework for rocm (part2), test=develop (#31010)

a60d93fb · Qi Li · GitHub · 565354f6 · a60d93fb · a60d93fb
19 changed file
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -4,6 +4,10 @@ if(WITH_PSLIB)
        nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
        DEPS heter_ps)
        add_subdirectory(heter_ps)
+    elseif(WITH_RCCL)
+        hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps)
+        add_subdirectory(heter_ps)
    else()
        cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
    endif(WITH_NCCL)
@@ -12,11 +16,16 @@ else()
    cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
 endif(WITH_PSLIB)
-if(WITH_NCCL)
+if(WITH_NCCL OR WITH_RCCL)
    cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
 endif()
 if(WITH_BOX_PS)
+    if(WITH_GPU)
        nv_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+    endif()
+    if(WITH_ROCM)
+        hip_library(box_wrapper SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps)
+    endif()
 else()
    cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
 endif(WITH_BOX_PS)

--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ b/paddle/fluid/framework/fleet/box_wrapper.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
-cudaStream_t BoxWrapper::stream_list_[8];
+gpuStream_t BoxWrapper::stream_list_[8];
 std::shared_ptr<boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
 AfsManager* BoxWrapper::afs_manager = nullptr;
 int BoxWrapper::embedx_dim_ = 8;

--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -142,8 +142,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                    ->stream();
  auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+#else
  cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
  case i: {                                                                  \
    constexpr size_t EmbedxDim = i;                                          \
@@ -155,6 +160,19 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
    }                                                                        \
  } break
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                        \
+  case i: {                                                                   \
+    constexpr size_t ExpandDim = i;                                           \
+    hipLaunchKernelGGL(                                                       \
+        PushCopy<EmbedxDim, ExpandDim>, dim3((total_length + 512 - 1) / 512), \
+        dim3(512), 0, stream, gpu_values,                                     \
+        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(      \
+            total_values_gpu),                                                \
+        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,       \
+        gpu_keys);                                                            \
+  } break
+#else
 #define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
  case i: {                                                                  \
    constexpr size_t ExpandDim = i;                                          \
@@ -166,6 +184,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
        gpu_keys);                                                           \
  } break
+#endif
  switch (hidden_size - 3) {
    EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8);
@@ -187,9 +206,16 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
                    platform::DeviceContextPool::Instance().Get(
                        BOOST_GET_CONST(platform::CUDAPlace, place)))
                    ->stream();
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
+                     dim3(512), 0, stream, origin_keys, total_keys, gpu_len,
+                     slot_num, total_len);
+  hipStreamSynchronize(stream);
+#else
  CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>(
      origin_keys, total_keys, gpu_len, slot_num, total_len);
  cudaStreamSynchronize(stream);
+#endif
 }
 void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
@@ -217,12 +243,21 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+  hipMemcpy(gpu_len, slot_lengths_lod.data(),
+            slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+  hipMemcpy(d_slot_vector, slot_vector_.data(),
+            slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice);
+#else
  cudaMemcpy(gpu_values, grad_values.data(),
             grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len, slot_lengths_lod.data(),
             slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector, slot_vector_.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
  case i: {                                                                  \
@@ -235,6 +270,18 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
    }                                                                        \
  } break
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(PushCopy<EmbedxDim, ExpandDim>,                       \
+        dim3(total_length + 512 - 1) / 512), dim3(512), 0, stream,           \
+        reinterpret_cast<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>*>( \
+            total_grad_values_gpu),                                          \
+        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
+        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
+  } break
+#else
 #define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
  case i: {                                                                  \
    constexpr size_t ExpandDim = i;                                          \
@@ -245,6 +292,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
  } break
+#endif
  switch (hidden_size - 3) {
    EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8);

--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ b/paddle/fluid/framework/fleet/box_wrapper.h
@@ -396,7 +396,7 @@ class BoxWrapper {
      const std::string& model_path) {
    if (nullptr != s_instance_) {
      VLOG(3) << "Begin InitializeGPU";
-      std::vector<cudaStream_t*> stream_list;
+      std::vector<gpuStream_t*> stream_list;
      for (int i = 0; i < platform::GetCUDADeviceCount(); ++i) {
        VLOG(3) << "before get context i[" << i << "]";
        platform::CUDADeviceContext* context =
@@ -542,8 +542,12 @@ class BoxWrapper {
      auto* gpu_data = gpu_tensor.data<T>();
      auto len = gpu_tensor.numel();
      data->resize(len);
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost);
+#else
      cudaMemcpy(data->data(), gpu_data, sizeof(T) * len,
                 cudaMemcpyDeviceToHost);
+#endif
    }
    static inline std::pair<int, int> parse_cmatch_rank(uint64_t x) {
      // first 32 bit store cmatch and second 32 bit store rank
@@ -819,7 +823,7 @@ class BoxWrapper {
  }
 private:
-  static cudaStream_t stream_list_[8];
+  static gpuStream_t stream_list_[8];
  static std::shared_ptr<boxps::BoxPSBase> boxps_ptr_;
  boxps::PSAgentBase* p_agent_ = nullptr;
  // TODO(hutuxian): magic number, will add a config to specify

--- a/paddle/fluid/framework/fleet/box_wrapper_impl.h
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -43,7 +43,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in PaddleBox now."));
  } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
    LoDTensor& total_keys_tensor = keys_tensor[device_id];
@@ -60,11 +60,17 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
+#ifdef PADDLE_WITH_HIP
+    hipMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
+              hipMemcpyHostToDevice);
+    hipMemcpy(gpu_len, slot_lengths_lod.data(),
+              slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+#else
    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
+#endif
    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                   static_cast<int>(slot_lengths.size()),
                   static_cast<int>(total_length));
@@ -124,7 +130,7 @@ void BoxWrapper::PushSparseGradCase(
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in PaddleBox now."));
  } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
    LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
    uint64_t* total_keys =

--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -698,13 +698,14 @@ void FleetWrapper::PushDenseVarsSync(
    Scope* scope, const uint64_t table_id,
    const std::vector<std::string>& var_names) {}
-#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (defined PADDLE_WITH_PSLIB)
 void FleetWrapper::PushDenseVarsAsync(
    const Scope& scope, const uint64_t table_id,
    const std::vector<std::string>& var_names,
    std::vector<::std::future<int32_t>>* push_sparse_status,
    float scale_datanorm, int batch_size, const paddle::platform::Place& place,
-    cudaStream_t stream, cudaEvent_t event) {
+    gpuStream_t stream, gpuEvent_t event) {
  std::vector<paddle::ps::Region> regions;
  for (auto& t : var_names) {
    Variable* var = scope.FindVar(t);
@@ -719,8 +720,13 @@ void FleetWrapper::PushDenseVarsAsync(
    memory::Copy(platform::CUDAPinnedPlace(), pin_g,
                 BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
                 sizeof(float) * count, stream);
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+    hipEventSynchronize(event);
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
    cudaEventSynchronize(event);
+#endif
    float* g = pin_g;
    if (scale_datanorm >= 0) {

--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -152,14 +152,14 @@ class FleetWrapper {
 // Push dense variables to server in async mode
 // Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
 // Param<out>: push_sparse_status
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void PushDenseVarsAsync(
      const Scope& scope, const uint64_t table_id,
      const std::vector<std::string>& var_names,
      std::vector<::std::future<int32_t>>* push_sparse_status,
      float scale_datanorm, int batch_size,
-      const paddle::platform::Place& place, cudaStream_t stream,
+      const paddle::platform::Place& place, gpuStream_t stream,
-      cudaEvent_t event);
+      gpuEvent_t event);
 #endif
 #ifdef PADDLE_WITH_XPU
  void PushDenseVarsAsync(

--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include <algorithm>
 #include <map>

--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
-nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc
+IF(WITH_GPU)
-heter_resource.h hashtable.h DEPS cub device_context)
+    nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
-nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS
+    nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
-heter_comm)
+    nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
-nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+IF(WITH_ROCM)
+    hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context)
+    hip_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
+    hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
+ENDIF()
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -45,15 +45,15 @@ class HashTable {
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;
  void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
-              cudaStream_t stream);
+              gpuStream_t stream);
  void get(const KeyType* d_keys, ValType* d_vals, size_t len,
-           cudaStream_t stream);
+           gpuStream_t stream);
  void show();
  void dump_to_cpu(int devid, cudaStream_t stream);
  template <typename GradType, typename Sgd>
  void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
-              Sgd sgd, cudaStream_t stream);
+              Sgd sgd, gpuStream_t stream);
 private:
  TableContainer<KeyType, ValType>* container_;

--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -87,7 +87,7 @@ void HashTable<KeyType, ValType>::show() {
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
-                                      size_t len, cudaStream_t stream) {
+                                      size_t len, gpuStream_t stream) {
  if (len == 0) {
    return;
  }
@@ -99,7 +99,7 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
 template <typename KeyType, typename ValType>
 void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
                                         const ValType* d_vals, size_t len,
-                                         cudaStream_t stream) {
+                                         gpuStream_t stream) {
  if (len == 0) {
    return;
  }
@@ -147,7 +147,7 @@ template <typename KeyType, typename ValType>
 template <typename GradType, typename Sgd>
 void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
                                         const GradType* d_grads, size_t len,
-                                         Sgd sgd, cudaStream_t stream) {
+                                         Sgd sgd, gpuStream_t stream) {
  if (len == 0) {
    return;
  }

--- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -25,7 +25,7 @@ __global__ void fill_idx(T* idx, size_t len) {
 }
 template <typename T>
-void show_tensor(T* input, size_t len, cudaStream_t stream, std::string name) {
+void show_tensor(T* input, size_t len, gpuStream_t stream, std::string name) {
  T tmp[len];
  cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
@@ -270,7 +270,7 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
  std::vector<std::shared_ptr<memory::Allocation>> d_key_bufs;
  std::vector<std::shared_ptr<memory::Allocation>> d_val_bufs;
-  cudaStream_t streams[stream_num];
+  gpuStream_t streams[stream_num];
  for (int i = 0; i < stream_num; ++i) {
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i])));
    auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType));

--- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
+++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h
@@ -34,16 +34,16 @@ class GPUResource {
  int dev_id() const { return dev_id_; }
  int index() const { return index_; }
-  cudaStream_t local_stream(int num) { return local_streams_[num]; }
+  gpuStream_t local_stream(int num) { return local_streams_[num]; }
-  cudaStream_t remote_stream() { return remote_stream_; }
+  gpuStream_t remote_stream() { return remote_stream_; }
-  cudaStream_t comm_stream(int num) { return comm_streams_[num]; }
+  gpuStream_t comm_stream(int num) { return comm_streams_[num]; }
  int dev_id_;
  int index_;
  std::vector<int> dev_ids_;
-  cudaStream_t remote_stream_;
+  gpuStream_t remote_stream_;
-  std::vector<cudaStream_t> local_streams_;
+  std::vector<gpuStream_t> local_streams_;
-  std::vector<cudaStream_t> comm_streams_;
+  std::vector<gpuStream_t> comm_streams_;
 };
 class HeterPsResource {
@@ -56,9 +56,9 @@ class HeterPsResource {
  int total_gpu();
  int get_index_by_devid(int devid);
  int dev_id(int num);
-  cudaStream_t local_stream(int gpu_num, int stream_num);
+  gpuStream_t local_stream(int gpu_num, int stream_num);
-  cudaStream_t remote_stream(int gpu_num);
+  gpuStream_t remote_stream(int gpu_num);
-  cudaStream_t comm_stream(int gpu_num, int stream_num);
+  gpuStream_t comm_stream(int gpu_num, int stream_num);
  std::vector<std::shared_ptr<GPUResource>> resources_;
  std::vector<int> dev_ids_;

--- a/paddle/fluid/framework/fleet/heter_wrapper.cc
+++ b/paddle/fluid/framework/fleet/heter_wrapper.cc
@@ -114,7 +114,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
    memcpy(data_ptr, tensor->data<void>(),
           tensor->numel() * SizeOfType(tensor->type()));
  } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    memory::Copy(platform::CPUPlace(), data_ptr,
                 BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                 tensor->data<void>(),
@@ -129,11 +129,11 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
  }
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place,
-                                       cudaStream_t stream) {
+                                       gpuStream_t stream) {
  // const VariableMessage& req_var = request->vars();
  auto* var = scope->FindVar(req_var.varname());
  auto* tensor = var->GetMutable<LoDTensor>();
@@ -157,7 +157,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope,
  void* tensor_data =
      tensor->mutable_data(place, ToVarType(req_var.data_type()));
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
               platform::CPUPlace(), req_var.data().data(),
               tensor->numel() * SizeOfType(tensor->type()), stream);

--- a/paddle/fluid/framework/fleet/heter_wrapper.h
+++ b/paddle/fluid/framework/fleet/heter_wrapper.h
@@ -86,9 +86,9 @@ class HeterWrapper {
  framework::proto::VarType::Type ToVarType(VariableMessage::Type type);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
-                           platform::Place place, cudaStream_t stream);
+                           platform::Place place, gpuStream_t stream);
 #endif
  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
                           platform::Place place);

--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -21,7 +21,7 @@ std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
 bool NCCLWrapper::is_initialized_ = false;
 void NCCLWrapper::InitNCCL() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
      &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
      nccl_info_.my_global_rank_));
@@ -30,14 +30,14 @@ void NCCLWrapper::InitNCCL() {
 }
 void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  nccl_info_.nccl_id_ = nccl_info.nccl_id_;
 #endif
  return;
 }
 NCCLInfo NCCLWrapper::GetNCCLId() {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  PADDLE_ENFORCE_CUDA_SUCCESS(
      platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
@@ -46,19 +46,23 @@ NCCLInfo NCCLWrapper::GetNCCLId() {
 void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
                              const int ranks) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  nccl_info_.local_rank_ = local_rank;
  nccl_info_.my_global_rank_ = global_rank;
  nccl_info_.global_ranks_ = ranks;
  platform::SetDeviceId(local_rank);
+#ifdef PADDLE_WITH_RCCL
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&(nccl_info_.stream_)));
+#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
+#endif
 #endif
  return;
 }
 void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
                          const std::vector<std::string>& var_names) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  for (auto& name : var_names) {
    auto var = scope.FindVar(name);
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
@@ -66,7 +70,11 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
        reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
        root_rank, nccl_info_.comm_, nccl_info_.stream_));
+#ifdef PADDLE_WITH_RCCL
+    hipStreamSynchronize(nccl_info_.stream_);
+#else
    cudaStreamSynchronize(nccl_info_.stream_);
+#endif
  }
 #endif
  return;

--- a/paddle/fluid/framework/fleet/nccl_wrapper.h
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -25,9 +25,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#if defined(PADDLE_WITH_NCCL)
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
 namespace paddle {
@@ -48,10 +51,10 @@ class NCCLInfo {
  int local_rank_;
  int global_ranks_;
  int my_global_rank_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  ncclUniqueId nccl_id_;
  ncclComm_t comm_;
-  cudaStream_t stream_;
+  gpuStream_t stream_;
 #endif
 };

--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -26,7 +26,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include <algorithm>
 #include <deque>

--- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
+++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include <atomic>
 #include <ctime>