[ROCM] update fluid framework for rocm (part5), test=develop (#31014)

c8fac5ee · Qi Li · GitHub · 580447d0 · c8fac5ee · c8fac5ee
19 changed file
--- a/paddle/fluid/framework/garbage_collector.cc
+++ b/paddle/fluid/framework/garbage_collector.cc
@@ -13,7 +13,7 @@
 // limitations under the License.

 #include <functional>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "gflags/gflags.h"
@@ -53,7 +53,7 @@ void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
 }
 #endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}
@@ -82,18 +82,27 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
                                               size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {
  platform::CUDADeviceGuard guard(place.device);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_));
+#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
+#endif
  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
 }

 StreamGarbageCollector::~StreamGarbageCollector() {
  auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace());
  platform::CUDADeviceGuard guard(place.device);
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_));
+#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
+#endif
 }

-cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
+gpuStream_t StreamGarbageCollector::stream() const { return stream_; }

 void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }


--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
@@ -80,7 +80,7 @@ class XPUGarbageCollector : public GarbageCollector {
 };
 #endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
@@ -110,13 +110,13 @@ class StreamGarbageCollector : public GarbageCollector {

  void Wait() const override;

-  cudaStream_t stream() const;
+  gpuStream_t stream() const;

 protected:
  void ClearCallback(const std::function<void()> &callback) override;

 private:
-  cudaStream_t stream_;
+  gpuStream_t stream_;
  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
 };


--- a/paddle/fluid/framework/heter_service.h
+++ b/paddle/fluid/framework/heter_service.h
@@ -152,7 +152,7 @@ class HeterObjectPool {
    std::lock_guard<std::mutex> lock(mutex_);
    if (pool_.empty()) {
      num_ += 1;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      VLOG(0) << "pool construct size: " << num_;
 #endif
      return std::make_shared<T>();

--- a/paddle/fluid/framework/heterbox_trainer.cc
+++ b/paddle/fluid/framework/heterbox_trainer.cc
@@ -21,9 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) &&                            \
    (defined PADDLE_WITH_PSLIB)
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 namespace paddle {
@@ -48,16 +49,25 @@ void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc,
      dataset->GetReaders();
  for (int i = 0; i < place_num; ++i) {
    int num = trainer_desc.worker_places(i);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    platform::CUDAPlace place = platform::CUDAPlace(num);
    platform::CUDADeviceGuard guard(place.device);
-    cudaStream_t stream;
+    gpuStream_t stream;
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+#endif
    copy_streams_.push_back(stream);
    places_.push_back(place);
-    cudaEvent_t event;
+    gpuEvent_t event;
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&event, hipEventDisableTiming));
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
+#endif
    events_.push_back(event);
 #endif
 #ifdef PADDLE_WITH_XPU
@@ -140,8 +150,13 @@ void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program,
        _ForEachDataType_(HeterMemcpyFunc);
      }
    }
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+    hipEventSynchronize(event);
+#else
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
    cudaEventSynchronize(event);
+#endif
  }
  place_ = place;
 }
@@ -150,7 +165,7 @@ template <typename T>
 void HeterBoxTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                  LoDTensor* root_tensor,
                                  const paddle::platform::Place& thread_place,
-                                  cudaStream_t stream) {
+                                  gpuStream_t stream) {
  T* thread_ptr =
      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
  T* root_ptr = root_tensor->data<T>();
@@ -171,7 +186,7 @@ void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) {
  for (size_t i = 0; i < places_.size(); ++i) {
    pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
    pull_dense_worker_->AddPlace(places_[i]);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    pull_dense_worker_->AddStream(copy_streams_[i]);
 #endif
  }

--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <string>
 #include <utility>
 #include <vector>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #endif

--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <cuda.h>
-#include <cuda_runtime.h>
 #include <stdio.h>

 #include "gtest/gtest.h"
@@ -34,8 +32,14 @@ TEST(LoD, data) {

  auto& v = lod[0];
  paddle::platform::CUDAPlace gpu(0);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu),
+                     v.size());
+  hipDeviceSynchronize();
+#else
  test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
  cudaDeviceSynchronize();
+#endif
  for (size_t i = 0; i < v.size(); ++i) {
    EXPECT_EQ(v[i], i * 2);
  }
@@ -59,8 +63,14 @@ TEST(LoDTensor, LoDInGPU) {

  auto lod = lod_tensor.lod();

+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0,
+                     lod[0].CUDAMutableData(place), lod[0].size());
+  hipDeviceSynchronize();
+#else
  test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
  cudaDeviceSynchronize();
+#endif

  for (size_t i = 0; i < src_lod[0].size(); ++i) {
    EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -31,7 +31,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>

--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -12,7 +12,13 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
 #include <memory>

 #include "glog/logging.h"
@@ -22,6 +28,7 @@

 template <typename T>
 using vec = paddle::framework::Vector<T>;
+using gpuStream_t = paddle::gpuStream_t;

 static __global__ void multiply_10(int* ptr) {
  for (int i = 0; i < 10; ++i) {
@@ -29,7 +36,7 @@ static __global__ void multiply_10(int* ptr) {
  }
 }

-cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
             paddle::platform::DeviceContextPool::Instance().Get(place))
      ->stream();
@@ -43,7 +50,12 @@ TEST(mixed_vector, GPU_VECTOR) {
  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu(0);

+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu),
+                     tmp.MutableData(gpu));
+#else
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
+#endif

  for (int i = 0; i < 10; ++i) {
    ASSERT_EQ(tmp[i], i * 10);
@@ -64,11 +76,23 @@ TEST(mixed_vector, MultiGPU) {
  ASSERT_EQ(tmp.size(), 10UL);
  paddle::platform::CUDAPlace gpu0(0);
  paddle::platform::SetDeviceId(0);
+
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu0),
+                     tmp.MutableData(gpu0));
+#else
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
+#endif
  paddle::platform::CUDAPlace gpu1(1);
  auto* gpu1_ptr = tmp.MutableData(gpu1);
  paddle::platform::SetDeviceId(1);
+
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu1),
+                     gpu1_ptr);
+#else
  multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
+#endif
  for (int i = 0; i < 10; ++i) {
    ASSERT_EQ(tmp[i], i * 100);
  }

--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -369,7 +369,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?

-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -193,7 +193,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  try {
    VLOG(4) << place << " " << DebugStringEx(&scope);
    if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
      PADDLE_THROW(platform::errors::Unavailable(
          "Cannot run operator on place %s, please recompile paddle or "
          "reinstall Paddle with CUDA support.",
@@ -1166,6 +1166,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 #if defined(PADDLE_WITH_CUDA)
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
    VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
+#endif
+#if defined(PADDLE_WITH_HIP)
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
+    VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
 #endif
  }


--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -384,7 +384,7 @@ class ExecutionContext {
    return device_context_;
  }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  const inline platform::CUDADeviceContext& cuda_device_context() const {
    PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true,
                      platform::errors::PreconditionNotMet(

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -37,7 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/profiler.h"

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif

@@ -60,7 +60,7 @@ static std::once_flag gProfileOnce;
 static bool gProfileStarted = false;
 #endif

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::once_flag p2p_init_flag;
 #endif

@@ -132,7 +132,7 @@ class ParallelExecutorPrivate {
    }
  }

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
    VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
            << ", num_trainers:" << bst.num_trainers_
@@ -371,7 +371,7 @@ class ParallelExecutorPrivate {

  std::unordered_map<std::string, bool> is_persistable_;

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  platform::NCCLCommunicator *nccl_ctxs_{nullptr};
 #elif defined(PADDLE_WITH_XPU_BKCL)
  platform::BKCLCommunicator *bkcl_ctxs_{nullptr};
@@ -483,7 +483,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
    }
    std::unique_ptr<GarbageCollector> gc;
    if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
            BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
@@ -572,7 +572,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
 }

 void InitP2P(const std::vector<platform::Place> &places) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  std::call_once(p2p_init_flag, [&]() {
    int count = places.size();
    if (count <= 1) return;
@@ -590,14 +590,24 @@ void InitP2P(const std::vector<platform::Place> &places) {
      for (int j = 0; j < count; ++j) {
        if (devices[i] == devices[j]) continue;
        int can_acess = -1;
+#ifdef PADDLE_WITH_HIP
+        hipError_t ret =
+            hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
+        if (ret != hipSuccess || can_acess != 1) {
+#else
        cudaError_t ret =
            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
        if (ret != cudaSuccess || can_acess != 1) {
+#endif
          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                       << " to " << devices[j];
        } else {
          platform::CUDADeviceGuard guard(devices[i]);
+#ifdef PADDLE_WITH_HIP
+          hipDeviceEnablePeerAccess(devices[j], 0);
+#else
          cudaDeviceEnablePeerAccess(devices[j], 0);
+#endif
        }
      }
    }
@@ -630,7 +640,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
        BuildStrategy::ReduceStrategy::kAllReduce;
    member_->use_all_reduce_ = true;
  }
-#if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32)
  if (member_->IsUseCUDA(member_->use_device_)) {
    PADDLE_ENFORCE_EQ(
        places.size(), 1,
@@ -638,7 +648,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
  }
 #endif

-#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
+    (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL))
  if (member_->IsUseCUDA(member_->use_device_)) {
    PADDLE_ENFORCE_EQ(
        places.size(), 1,
@@ -710,7 +721,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
  }

  if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);

    // Initialize device context's nccl comm, will be used by normal
@@ -774,7 +785,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
  // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
  // ncclOp
  std::vector<ir::Graph *> async_graphs(places.size());
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  if (member_->build_strategy_.async_mode_) {
    VLOG(3) << "use local async mode";
    graph = member_->build_strategy_.Apply(
@@ -885,7 +896,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    final_graphs = async_graphs;
  } else if (member_->build_strategy_.enable_parallel_graph_) {
    VLOG(3) << "use ParallelSSAGraphExecutor";
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    // TODO(Yancey1989): Remove passing in the main_program when
    // allreduce_seq_pass doesn't need it as the attr.
    bool is_inference = details::IsDataParallelInferenceGraph(*graph);
@@ -996,7 +1007,7 @@ void ParallelExecutor::BCastParamsToDevices(
    }
    auto &dims = main_tensor.dims();
    if (paddle::platform::is_gpu_place(main_tensor.place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
      std::vector<void *> buffers;
      buffers.reserve(member_->places_.size());
      size_t numel = main_tensor.numel();

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif


--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
 #include "paddle/fluid/framework/trainer.h"

--- a/paddle/fluid/framework/ps_gpu_trainer.cc
+++ b/paddle/fluid/framework/ps_gpu_trainer.cc
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
 #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include "paddle/fluid/platform/cuda_device_guard.h"

 namespace paddle {

--- a/paddle/fluid/framework/ps_gpu_worker.cc
+++ b/paddle/fluid/framework/ps_gpu_worker.cc
@@ -19,7 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/string/string_helper.h"

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 #include "paddle/fluid/platform/cuda_device_guard.h"

 #if defined _WIN32 || defined __APPLE__

--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -59,17 +59,19 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
    current_version_[tid] = 0;
  }
  fleet_ptr_ = FleetWrapper::GetInstance();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  copy_streams_.clear();
 #endif
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
  places_.clear();
  thread_scopes_.clear();
 #endif
 }

 void PullDenseWorker::CreatePinVar() {
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
  // for (auto& v : dense_value_names_) {
  //  for (auto& name : v.second) {
  for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -84,7 +86,7 @@ void PullDenseWorker::CreatePinVar() {
      auto* ptr = root_scope_->Var(name + "pin");
      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      pin_tensor->mutable_data<float>(tensor->dims(),
                                      platform::CUDAPinnedPlace());
 #endif
@@ -113,7 +115,8 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
    exit(-1);
  }
  status_vec->resize(0);
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)

  for (size_t i = 0; i < places_.size(); ++i) {
    // for (auto& v : dense_value_names_) {
@@ -131,7 +134,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
        Variable* var = thread_scopes_[i]->FindVar(name);
        LoDTensor* tensor = var->GetMutable<LoDTensor>();
        float* w = tensor->data<float>();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                     platform::CUDAPinnedPlace(), pin_w,
                     sizeof(float) * tensor->numel(), copy_streams_[i]);
@@ -161,7 +164,8 @@ void PullDenseWorker::PullDense(bool force_update) {
    uint64_t tid = static_cast<uint64_t>(
        dwp_param_.program_config(0).pull_dense_table_id(i));
    if (force_update || CheckUpdateParam(tid)) {
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
      VLOG(3) << "pull dense " << force_update << " " << tid;
      fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                     &pull_dense_status_, false);

--- a/paddle/fluid/framework/save_load_util.cc
+++ b/paddle/fluid/framework/save_load_util.cc
@@ -297,7 +297,7 @@ bool SaveTensorToDisk(const std::string& file_name,
        tensor->numel() * framework::SizeOfType(tensor->type());
    auto* data_ptr = tensor->data<void>();
    if (platform::is_gpu_place(tensor->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      framework::Tensor temp;
      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
      data_ptr = temp.data<void>();

--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <float.h>
 #include "paddle/fluid/framework/device_worker.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
@@ -38,7 +38,7 @@ void SectionWorker::TrainFiles() {
  std::unique_ptr<GarbageCollector> gc;
  auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_);
  if (max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    if (platform::is_gpu_place(place_)) {
      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
@@ -70,7 +70,11 @@ void SectionWorker::TrainFiles() {
        }
      }
    }
+#ifdef PADDLE_WITH_RCCL
+    hipDeviceSynchronize();
+#else
    cudaDeviceSynchronize();
+#endif
  }

  // backward pass
@@ -89,7 +93,11 @@ void SectionWorker::TrainFiles() {
        }
      }
    }
+#ifdef PADDLE_WITH_RCCL
+    hipDeviceSynchronize();
+#else
    cudaDeviceSynchronize();
+#endif
  }

  // update pass