Unverified commit 580447d0, authored by Qi Li and committed by GitHub

[ROCM] update fluid framework for rocm (part4), test=develop (#31013)

Parent 7d91974c
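The diff below repeats a small set of idioms: each CMake GPU target gains a WITH_ROCM branch that calls hip_library/hip_test instead of nv_library/nv_test, and each CUDA-only preprocessor guard gains a HIP counterpart. A minimal compilable sketch of the C++ guard side, assuming only the Paddle build macros that appear in the diff (the constants and helper below are illustrative, not Paddle code):

```cpp
// Illustration of the guard idioms applied throughout this commit.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr bool kHasGpuBackend = true;   // code shared by CUDA and ROCm builds
#else
constexpr bool kHasGpuBackend = false;  // CPU-only build
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
constexpr bool kHasCollectiveBackend = true;   // NCCL on CUDA, RCCL on ROCm
#else
constexpr bool kHasCollectiveBackend = false;
#endif

inline const char* BackendName() {
#if defined(PADDLE_WITH_HIP)
  return "ROCm/HIP";  // hipcc build; device-side guards check __HIPCC__
#elif defined(PADDLE_WITH_CUDA)
  return "CUDA";      // nvcc build; device-side guards check __CUDA_ARCH__
#else
  return "CPU";
#endif
}
```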
@@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if(WITH_GPU)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
elseif(WITH_ROCM)
hip_test(dim_test SRCS dim_test.cu DEPS ddim)
endif()
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
@@ -46,6 +50,8 @@ if(WITH_GPU)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
elseif(WITH_ROCM)
hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
@@ -53,6 +59,8 @@ endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
if(WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
elseif(WITH_ROCM)
hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor)
endif()
@@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
if(WITH_GPU)
nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
elseif(WITH_ROCM)
hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
if(WITH_GPU)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
elseif(WITH_ROCM)
hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
endif()
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
@@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope)
elseif(WITH_ROCM)
hip_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope)
endif()
if(WITH_GPU)
if (WIN32)
@@ -108,6 +128,9 @@ if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
elseif(WITH_ROCM)
hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if(WITH_GPU)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
elseif(WITH_ROCM)
hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
endif()
if(WITH_PYTHON)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
......
@@ -54,7 +54,7 @@ class Array {
}
HOSTDEVICE inline T &at(size_t i) {
#ifndef __CUDA_ARCH__
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
#endif
@@ -62,7 +62,7 @@ class Array {
}
HOSTDEVICE inline const T &at(size_t i) const {
#ifndef __CUDA_ARCH__
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
#endif
@@ -103,7 +103,12 @@ class Array<T, 0> {
HOSTDEVICE inline T *GetMutable() { return nullptr; }
HOSTDEVICE inline T &operator[](size_t) {
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP raises a compile error if "static T obj();" is used:
// "function declared in block scope cannot have 'static' storage class"
static T obj{};
return obj;
#elif defined(__CUDA_ARCH__)
static T obj();
return obj;
#else
@@ -112,7 +117,12 @@ class Array<T, 0> {
}
HOSTDEVICE inline const T &operator[](size_t) const {
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP raises a compile error if "static const T obj();" is used:
// "function declared in block scope cannot have 'static' storage class"
static const T obj{};
return obj;
#elif defined(__CUDA_ARCH__)
static const T obj();
return obj;
#else
......
@@ -16,7 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#else
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace framework {
@@ -32,7 +37,20 @@ class ConvSearchCache {
static ConvSearchCache instance;
return instance;
}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetForward() {
return &forward_cache_;
}
AlgorithmsCache<miopenConvBwdDataAlgorithm_t>* GetBackwardData() {
return &backward_data_cache_;
}
AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t>* GetBackwardFilter() {
return &backward_filter_cache_;
}
AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetConvFusion() {
return &fusion_forward_cache_;
}
#else
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
return &forward_cache_;
}
@@ -45,6 +63,7 @@ class ConvSearchCache {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
return &fusion_forward_cache_;
}
#endif
private:
ConvSearchCache() {}
@@ -52,10 +71,17 @@ class ConvSearchCache {
ConvSearchCache(const ConvSearchCache&) {}
ConvSearchCache& operator=(const ConvSearchCache&) {}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache<miopenConvFwdAlgorithm_t> forward_cache_;
AlgorithmsCache<miopenConvBwdDataAlgorithm_t> backward_data_cache_;
AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t> backward_filter_cache_;
AlgorithmsCache<miopenConvFwdAlgorithm_t> fusion_forward_cache_;
#else
AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
#endif
};
} // namespace framework
......
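The HIP branch of ConvSearchCache mirrors the cuDNN one member for member, only swapping the algorithm enum types. A hypothetical caller-side sketch under that assumption (the Instance() accessor and the alias name are assumptions, not shown in this diff):

```cpp
// Hypothetical sketch: pick the backend's forward-algorithm type once, so the
// rest of a conv kernel can stay backend-neutral.
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"  // miopen* algorithm enums
using ConvFwdAlgo = miopenConvFwdAlgorithm_t;
#else
#include "paddle/fluid/platform/cudnn_helper.h"   // cudnn* algorithm enums
using ConvFwdAlgo = cudnnConvolutionFwdAlgo_t;
#endif

// Assumed usage (not part of this diff):
//   auto* cache = paddle::framework::ConvSearchCache::Instance().GetForward();
//   // cache would be an AlgorithmsCache<ConvFwdAlgo>* in either build.
```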
@@ -31,7 +31,7 @@ namespace framework {
static std::vector<platform::Place> CreatePlaceList() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
places.emplace_back(platform::CUDAPlace(0));
#endif
return places;
......
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
} else {
#ifdef PADDLE_WITH_CUDA
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported GPU, please compile with option WITH_GPU=ON."));
"Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
"WITH_ROCM=ON."));
#endif
}
}
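A stand-alone version of the dispatch above, as a hedged sketch (CopyHostToDevice is an illustrative name, not Paddle's API), shows how one call site serves either runtime:

```cpp
#include <cstddef>
#include <stdexcept>
#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#elif defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#endif

// Host-to-device copy that compiles against whichever runtime is enabled.
inline void CopyHostToDevice(void* dst, const void* src, std::size_t size) {
#if defined(PADDLE_WITH_CUDA)
  cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
  (void)dst; (void)src; (void)size;
  throw std::runtime_error("built without WITH_GPU or WITH_ROCM");
#endif
}
```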
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
#endif
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
template <typename T>
void PrivateInstantDataFeed<T>::PutToFeedVec() {
for (size_t i = 0; i < use_slots_.size(); ++i) {
......
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
int pv_batch_size_;
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
template <typename T>
class PrivateInstantDataFeed : public DataFeed {
public:
......
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
#endif
} // namespace framework
......
@@ -47,7 +47,7 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
......
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
}
} else if (platform::is_gpu_place(in_tensor.place())) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
VarHandle *out_handle = nullptr;
int root_id =
BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
......
@@ -34,7 +34,7 @@ class Node;
} // namespace ir
} // namespace framework
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
struct NCCLContextMap;
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
@@ -43,7 +43,7 @@ struct BKCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
@@ -55,7 +55,7 @@ namespace details {
struct BroadcastOpHandle : public OpHandleBase {
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
const platform::NCCLContextMap *nccl_ctxs_;
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLContextMap *bkcl_ctxs_;
......
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
......
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<ir::Node>> nodes_;
std::vector<p::Place> place_list_;
DeviceType use_device_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
} else if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
#if defined(PADDLE_WITH_XPU_BKCL)
bkcl_ctxs_.reset(nullptr);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_ctxs_.reset(nullptr);
#endif
}
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
......
@@ -52,7 +52,7 @@ class DeviceContext;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
@@ -73,11 +73,12 @@ class PullDenseWorker {
public:
virtual ~PullDenseWorker() {}
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); }
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
@@ -137,8 +138,8 @@ class PullDenseWorker {
float total_batch_num_ = 0;
std::unordered_map<const Scope*, int> scope_to_thread_id_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
#endif
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
@@ -167,9 +168,9 @@ class DeviceWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void ProduceTasks() {}
virtual void GetXpuOpIndex() {}
#ifdef PADDLE_WITH_CUDA
virtual void SetStream(const cudaStream_t stream) {}
virtual void SetEvent(const cudaEvent_t event) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
virtual void SetStream(const gpuStream_t stream) {}
virtual void SetEvent(const gpuEvent_t event) {}
#endif
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class HeterBoxWorker : public HogwildWorker {
public:
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
void ResetStat();
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
cudaEvent_t event_;
cudaStream_t copy_stream_;
gpuEvent_t event_;
gpuStream_t copy_stream_;
int batch_cnt_{0};
std::atomic<int> done_cnt_{0};
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class PSGPUWorker : public HogwildWorker {
public:
PSGPUWorker() {}
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
void ResetStat();
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
cudaEvent_t event_;
cudaStream_t copy_stream_;
gpuEvent_t event_;
gpuStream_t copy_stream_;
int batch_cnt_{0};
std::atomic<int> done_cnt_{0};
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class SectionWorker : public DeviceWorker {
public:
SectionWorker() {}
......
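The gpuStream_t and gpuEvent_t types used in the device_worker.h signatures above let one declaration serve both runtimes; presumably they are platform aliases defined outside this diff, roughly along these lines (illustrative sketch, not the actual Paddle header):

```cpp
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // ROCm build
using gpuEvent_t = hipEvent_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // CUDA build
using gpuEvent_t = cudaEvent_t;
#endif
```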
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
} // namespace framework
......
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
// construct a Dim on the GPU
thrust::device_vector<paddle::framework::Dim<2>> t(2);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
thrust::raw_pointer_cast(t.data()));
#else
test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
#endif
a = t[0];
EXPECT_EQ(a[0], 5);
EXPECT_EQ(a[1], 6);
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
// dynamic access on GPU
thrust::device_vector<int64_t> r(1);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
thrust::raw_pointer_cast(r.data()));
#else
dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
#endif
int64_t res = r[0];
EXPECT_EQ(res, 6);
}
......
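The launch change in dim_test.cu is purely syntactic: hipLaunchKernelGGL passes the same grid, block, shared-memory, and stream arguments that the triple-chevron form encodes. A self-contained illustration under that assumption (scale_kernel and LaunchScale are made up, not part of the test):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

__global__ void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void LaunchScale(float* data, float factor, int n) {
  dim3 grid((n + 255) / 256), block(256);
#ifdef PADDLE_WITH_HIP
  // hipLaunchKernelGGL(kernel, grid, block, shared_mem_bytes, stream, args...)
  hipLaunchKernelGGL(scale_kernel, grid, block, 0, 0, data, factor, n);
#else
  scale_kernel<<<grid, block, 0, 0>>>(data, factor, n);
#endif
}
```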
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLContext ctx;
ctx.device_type = kDLGPU;
ctx.device_id = place.device;
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLContext ctx;
ctx.device_type = kDLCPUPinned;
ctx.device_id = 0;
......
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
template <typename T>
void TestMainLoop() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<platform::Place> places{platform::CPUPlace(),
platform::CUDAPlace(0),
platform::CUDAPinnedPlace()};
......
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
std::unique_ptr<GarbageCollector> gc;
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
......
@@ -25,7 +25,7 @@ namespace paddle {
namespace framework {
const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static int64_t num_cuda_devices = -1;
static std::once_flag num_devices_init_flag;
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
uint64_t increament_offset) {
uint64_t cur_offset = this->state_.thread_offset;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::lock_guard<std::mutex> lock(this->mu_);
this->state_.thread_offset += increament_offset;
......