Unverified commit 28b356b9, authored by Qi Li and committed by GitHub

[ROCM] update fluid framework for rocm (part6), test=develop (#31015)

Parent commit: c8fac5ee
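
Before the diff, a minimal self-contained sketch (not part of the patch) of the two porting idioms this commit applies throughout: guarding GPU-only code with a check that covers both the CUDA and HIP builds, and wrapping kernel launches so that HIP builds use hipLaunchKernelGGL while CUDA builds keep the <<<...>>> syntax. The FillOnes kernel and LaunchFillOnes helper are illustrative names, not code from the patch.

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

// Trivial kernel: each thread writes one element of the buffer.
__global__ void FillOnes(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = 1.0f;
}

// Launch helper: same kernel, backend-specific launch syntax.
void LaunchFillOnes(float* buf, int n) {
#ifdef PADDLE_WITH_HIP
  hipLaunchKernelGGL(FillOnes, dim3(1), dim3(n), 0, 0, buf, n);
#else
  FillOnes<<<1, n, 0, 0>>>(buf, n);
#endif
}

#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
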
......@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
EXPECT_EQ(static_cast<int>(p2[0]), 1);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
float* p1 = nullptr;
......@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
......@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
......
......@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
......@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
......@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
......@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
constexpr int MAX_BLOCK_DIM = 512;
const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
......@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
platform::errors::ResourceExhausted(
"tensor size %d overflow when writing tensor", size));
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
......@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
if (dl_tensor.ctx.device_type == kDLCPU) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl_tensor.ctx.device_type == kDLGPU) {
platform::CUDAPlace dst_place =
platform::CUDAPlace(dl_tensor.ctx.device_id);
......
......@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
......@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
......@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
......
......@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
Tensor src_tensor;
Tensor gpu_tensor;
......@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
delete cpu_place;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor;
......@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
EXPECT_EQ(src_ptr[i], dst[i]);
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor;
......@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor;
......@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
delete place;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
Tensor gpu_tensor;
gpu_tensor.Resize({2, 3});
......
......@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
......@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
Tensor tensor;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
......@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
......@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
Tensor tensor;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
......@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
......@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
......@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
......@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
......@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
......@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
......@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
......@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......
......@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class HeterServiceContext {
public:
......@@ -155,8 +156,9 @@ class HeterServiceContext {
void Reset() { push_dense_status_.clear(); }
int place_num_;
Scope* scope_{nullptr};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuEvent_t event_;
#endif
std::vector<OperatorBase*> ops_;
std::vector<::std::future<int32_t>> push_dense_status_;
......@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
template <typename T>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream);
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_XPU
void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
......@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
std::vector<Scope*> place_scopes_;
BtObjectPool<HeterServiceContext> object_pool_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
std::vector<gpuEvent_t> events_;
#endif
};
......@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
template <typename T>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream);
gpuStream_t stream);
#endif
void CreateThreadParam(const ProgramDesc& program, int num);
template <typename T>
......@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
std::vector<std::thread> threads_;
int use_ps_gpu_;
int thread_num_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
std::vector<gpuEvent_t> events_;
#endif
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class PSGPUTrainer : public TrainerBase {
public:
PSGPUTrainer() {}
......@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class PipelineTrainer : public TrainerBase {
public:
PipelineTrainer() {}
......
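
The trainer headers above replace cudaStream_t/cudaEvent_t with gpuStream_t/gpuEvent_t. Those aliases are provided by Paddle platform headers that are not part of this diff; the sketch below only illustrates how such backend-neutral aliases are typically defined and is not taken from the patch.

// Illustrative only; the real definitions live in Paddle's platform
// headers and may differ in detail.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif
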
......@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
REGISTER_TRAINER_CLASS(HeterBoxTrainer);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif
} // namespace framework
......
......@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
......
......@@ -30,6 +30,12 @@
#include <nccl.h>
#endif
#endif
#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
......@@ -39,8 +45,8 @@
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class Communicator;
class NCCLCommunicator;
#endif
......@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
#endif
operators::CudnnRNNCache,
......
......@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
......
if (NOT WITH_NCCL)
if (NOT (WITH_NCCL OR WITH_RCCL))
return()
endif()
......@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif()
if(WITH_GPU)
if(WITH_ROCM AND NOT WIN32)
hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif()
if(WITH_GPU OR WITH_ROCM)
op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
endif()
if(NOT WIN32)
if(WITH_GPU AND NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
if(WITH_ROCM AND NOT WIN32)
hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
......@@ -23,7 +23,11 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......
......@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
if (WITH_GPU)
if (WITH_GPU OR WITH_ROCM)
set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
endif()
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
endif()
......@@ -21,7 +21,7 @@ endif()
if(NOT WIN32)
set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
endif()
endif(NOT WIN32)
......@@ -71,7 +71,7 @@ if (WITH_PSCORE)
list(APPEND PYBIND_SRCS fleet_py.cc)
endif()
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
endif()
......@@ -81,9 +81,9 @@ if(WITH_PYTHON)
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
if(WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
endif(WITH_NCCL)
endif()
if(WITH_XPU_BKCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
......@@ -93,6 +93,9 @@ if(WITH_PYTHON)
target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(op_function_generator ${os_dependency_modules})
if(WITH_ROCM)
target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
endif()
set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
set(tmp_impl_file ${impl_file}.tmp)
......@@ -164,20 +167,17 @@ if(WITH_PYTHON)
endif(WITH_MKLDNN)
endif(WIN32)
if(WITH_ROCM_PLATFORM)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
else()
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT WIN32)
endif(WITH_ROCM_PLATFORM)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT WIN32)
if(WITH_ROCM)
target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
endif()
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules})
......
......@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cudnn
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
......@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
......
......@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
[](imperative::VarBase &self,
const imperative::ParallelStrategy &strategy) {
if (strategy.nranks_ > 1) {
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if NCCL_VERSION_CODE >= 2212
imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
#else
......@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
)DOC")
.def("pin_memory",
[](const std::shared_ptr<imperative::VarBase> &self) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to pinned memory in CPU version "
"Paddle, "
......@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
.def("cuda",
[](const std::shared_ptr<imperative::VarBase> &self, int device_id,
bool blocking) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to GPU in CPU version Paddle, "
"Please recompile or reinstall Paddle with CUDA support."));
......@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
},
py::call_guard<py::gil_scoped_release>());
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext");
......
......@@ -32,7 +32,8 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m) {
py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
*m, "PSGPU")
......
......@@ -22,7 +22,8 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m);
#endif
} // namespace pybind
......
......@@ -86,7 +86,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
......@@ -95,11 +95,13 @@ limitations under the License. */
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/gpu_info.h"
#endif
......@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
return false;
#else
return true;
#endif
}
bool IsCompiledWithROCM() {
#ifndef PADDLE_WITH_HIP
return false;
#else
return true;
......@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("set_num_threads", &platform::SetNumThreads);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion);
#endif
......@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
if (dl.ctx.device_type == kDLCPU) {
paddle::framework::TensorFromDLPack(dl, &tensor);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl.ctx.device_type == kDLGPU) {
paddle::framework::TensorFromDLPack(dl, &tensor);
}
......@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
.def("height", &SelectedRows::height)
.def("set_rows",
[](SelectedRows &self, std::vector<int64_t> rows) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self.set_rows(rows);
#else
Vector<int64_t> new_rows(rows);
......@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
......@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CUDAPinnedPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
......@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif
});;
// clang-format on
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
......@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC")
.def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or "
......@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
std::exit(-1);
#endif
})
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>)
......@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC")
.def("__init__",
[](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
......@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", []() { framework::InitDevices(); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16);
......@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
// Only GPUs with Compute Capability >= 53 support float16
return platform::GetCUDAComputeCapability(place.device) >= 53;
......@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::take_ownership);
m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
#ifndef _WIN32
#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
......@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
m.def("get_cublas_switch", platform::AllowTF32Cublas);
m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
......@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
BindPSGPUWrapper(&m);
#endif
BindGlooWrapper(&m);
......@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_BOX_PS
BindBoxWrapper(&m);
#endif
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
BindNCCLWrapper(&m);
#endif
#ifdef PADDLE_WITH_GLOO
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
......@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
} else if (platform::is_gpu_place(self.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
......@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
} else if (platform::is_gpu_place(self->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
......@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) {
// NOTE(wangxi): When copying data to the accelerator card,
// we need set_device(dev_id) first.
......@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
#ifdef PADDLE_WITH_HIP
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
hipMemcpyHostToDevice);
#else
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
cudaMemcpyHostToDevice);
#endif
} else if (paddle::platform::is_cuda_pinned_place(place)) {
auto dst = self->mutable_data<T>(place);
......@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
self.type());
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
self.type());
......@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (is_gpu_tensor) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
......