diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 92a29d5165ce790bbbc9673979c5994317bb3dfb..54f779813063362956130ec9314365f89a234c1e 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(static_cast<int>(p2[0]), 1);
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     float* p1 = nullptr;
@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     framework::Tensor dst_tensor;
@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1ad321df216fe16f8731f400026716f1c33b84e3..c6ac30a369859db9de244990231a307074e973ed 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
   return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
 }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
   CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   }
 
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
     constexpr int MAX_BLOCK_DIM = 512;
     const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
                       platform::errors::ResourceExhausted(
                           "tensor size %d overflow when writing tensor", size));
     if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
       std::unique_ptr<char[]> buf(new char[kBufSize]);
      auto& gpu_dev_ctx =
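Note: the hunks above all make the same mechanical change, widening the single-toolkit guard `#ifdef PADDLE_WITH_CUDA` to `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)` so the GPU branches compile under either toolkit. A minimal self-contained sketch of the idiom (illustrative only, not Paddle code; the PADDLE_WITH_* macros come from the build system):

    // One outer guard covers both GPU toolchains; an inner #ifdef picks the runtime.
    #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    // Hypothetical helper: reports whether a usable device is present.
    bool GpuAvailable() {
      int count = 0;
    #ifdef PADDLE_WITH_HIP
      return hipGetDeviceCount(&count) == hipSuccess && count > 0;
    #else
      return cudaGetDeviceCount(&count) == cudaSuccess && count > 0;
    #endif
    }
    #endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP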
@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
   if (dl_tensor.ctx.device_type == kDLCPU) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dl_tensor.ctx.device_type == kDLGPU) {
     platform::CUDAPlace dst_place =
         platform::CUDAPlace(dl_tensor.ctx.device_id);
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 50644370bc6b6f9d3ed72653a67c9f9d301bea5d..8a127e0ed5929525a96914c70f20411a61ed88e6 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(src.place())) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index e389cb34679a2459b547497a46e97586c7cd72bd..c32efd0a470be201344fa8d7f817792315b7e6ef 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
   }
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
     delete cpu_place;
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
       EXPECT_EQ(src_ptr[i], dst[i]);
     }
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor gpu_tensor;
@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
     }
   }
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
     EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor gpu_tensor;
     gpu_tensor.Resize({2, 3});
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
index a51f74199e714b8606c9766c57bc6b1dc4c73c65..4517726a5c09d3997a90fff1dd4d2aadd8db27b2 100644
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data<float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data<float>({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
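Note: the tensor_util_test.cu hunks above route every kernel launch on the HIP path through the hipLaunchKernelGGL macro instead of CUDA's triple-chevron syntax. A standalone sketch of the pattern (the FillNAN kernel mirrors the test's; the launcher and the gpuStream_t alias are illustrative assumptions, not Paddle code):

    #include <cmath>
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;   // illustrative alias
    #else
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;  // illustrative alias
    #endif

    // Same shape as the test kernel: poison a 3-element buffer with a NaN.
    __global__ void FillNAN(float* buf) {
      buf[0] = 0.0f;
      buf[1] = 0.1f;
      buf[2] = NAN;
    }

    // One launcher, two launch syntaxes.
    void LaunchFillNAN(float* buf, gpuStream_t stream) {
    #ifdef PADDLE_WITH_HIP
      // HIP: the kernel and launch configuration are macro arguments.
      hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, stream, buf);
    #else
      // CUDA: classic <<<grid, block, shared_mem, stream>>> syntax.
      FillNAN<<<1, 1, 0, stream>>>(buf);
    #endif
    }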
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index d949ba2bffe6cf6957adc0b03b1abb2fe8f5fde2..ca290a50b42fe0bfe37385c15bc54d8c5a305a06 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
 };
 
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
@@ -155,8 +156,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuEvent_t event_;
 #endif
   std::vector<OperatorBase*> ops_;
   std::vector<::std::future<int32_t>> push_dense_status_;
@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
   std::vector<Scope*> place_scopes_;
   BtObjectPool<HeterServiceContext> object_pool_;
   std::vector<platform::Place> places_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 
@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template <typename T>
@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
   std::vector<std::thread> threads_;
   int use_ps_gpu_;
   int thread_num_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 #endif
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUTrainer : public TrainerBase {
  public:
   PSGPUTrainer() {}
@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
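Note: trainer.h switches its members and signatures from cudaEvent_t/cudaStream_t to toolkit-neutral gpuEvent_t/gpuStream_t aliases, which Paddle defines in its platform headers. A minimal standalone equivalent (an illustration of the aliasing scheme, not the actual Paddle header):

    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    #else  // PADDLE_WITH_CUDA
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif

    // With the aliases in place, declarations stay single-source, e.g.:
    //   std::vector<gpuStream_t> copy_streams_;
    //   std::vector<gpuEvent_t> events_;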
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 764338a8cc671b2cb8a696e2389eba197a560425..6b9dbece8974c286a390627f25e4a25ee8bfb8d3 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
 
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 REGISTER_TRAINER_CLASS(HeterBoxTrainer);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 81c7d0d0c884038b65730525bfff2fd9c1192f4f..886d00e562bff1a1de9ebb45162f05cb5033b534 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 2fd4de5cfcba42b91a4b2da200df0404e62aa2b5..b0d8f43a90f35f08c7bcc09858842424f53ddbe0 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -30,6 +30,12 @@
 #include <nccl.h>
 #endif
 #endif
+#ifdef PADDLE_WITH_HIP
+#include <miopen/miopen.h>
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
+#endif
 
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
@@ -39,8 +45,8 @@
 namespace paddle {
 namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 9d1bd77ebdf695790e5583bf7ad207664f8ccef9..2a6635c4b6050b8f70a4e517d550b66581ab3a21 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
 #endif
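Note: the var_type_traits changes lean on RCCL being a drop-in replacement for NCCL: it exports the same nccl-prefixed names (ncclComm_t, ncclUniqueId, ...), so only the include and the guard macro differ while types like platform::NCCLCommunicator stay single-source. A minimal sketch of the idiom (illustrative, not Paddle code):

    #if defined(PADDLE_WITH_RCCL)
    #include <rccl.h>   // AMD build: RCCL re-exports the NCCL symbol names
    #elif defined(PADDLE_WITH_NCCL)
    #include <nccl.h>   // NVIDIA build
    #endif

    #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    // Downstream code keeps using the nccl* spellings on both stacks.
    ncclUniqueId GenerateUniqueId() {
      ncclUniqueId id;
      ncclGetUniqueId(&id);  // identical signature under NCCL and RCCL
      return id;
    }
    #endif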
+if (NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
 
@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
   nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
 
-if(WITH_GPU)
+if(WITH_ROCM AND NOT WIN32)
+  hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
+endif()
+
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
   set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
 endif()
 
-if(NOT WIN32)
+if(WITH_GPU AND NOT WIN32)
   nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
+
+if(WITH_ROCM AND NOT WIN32)
+  hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
index 558ff4cc09603eebbcd95a234ff1aa63ada7fbb2..01905d8ca84b3bca5fa55dd1dec641fde29804df 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -23,7 +23,11 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#else
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index bdf018db6f883c63bd1fd0c4ecd71c2c2b54cb02..ccf589e8588f614cecbf9518badbf6e4350b4f03 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
     analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
     gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
 
-if (WITH_GPU)
+if (WITH_GPU OR WITH_ROCM)
   set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
   set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
 endif()
 
-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
   set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
 endif()
@@ -21,7 +21,7 @@ endif()
 if(NOT WIN32)
   set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
   set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
-  if (WITH_NCCL)
+  if (WITH_NCCL OR WITH_RCCL)
     set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
   endif()
 endif(NOT WIN32)
@@ -71,7 +71,7 @@ if (WITH_PSCORE)
   list(APPEND PYBIND_SRCS fleet_py.cc)
 endif()
 
-if (WITH_NCCL)
+if (WITH_NCCL OR WITH_RCCL)
   list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
 endif()
@@ -81,9 +81,9 @@ if(WITH_PYTHON)
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
   list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
 
-  if(WITH_NCCL)
+  if (WITH_NCCL OR WITH_RCCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
-  endif(WITH_NCCL)
+  endif()
 
   if(WITH_XPU_BKCL)
     list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
@@ -93,6 +93,9 @@ if(WITH_PYTHON)
   target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(op_function_generator ${os_dependency_modules})
+  if(WITH_ROCM)
+    target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
+  endif()
 
   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
@@ -164,20 +167,17 @@ if(WITH_PYTHON)
     endif(WITH_MKLDNN)
   endif(WIN32)
 
-  if(WITH_ROCM_PLATFORM)
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-  else()
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT WIN32)
-      target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT WIN32)
-  endif(WITH_ROCM_PLATFORM)
+  cc_library(paddle_pybind SHARED
+    SRCS ${PYBIND_SRCS}
+    DEPS ${PYBIND_DEPS}
+    ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+  if(NOT APPLE AND NOT WIN32)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT WIN32)
+
+  if(WITH_ROCM)
+    target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
+  endif()
 
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index fa44eeb485c27a27a38ccb87637dfba2accb6611..1732cf5bfdee1f5450d551887482b3123921df8f 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
 DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // cudnn
 DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
       FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
       FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
"ParallelContext"); diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 96acfd7bc040439852bb8f02fce44bfdfebae335..5bff9178fdfa5bff8af8a5f1418d700e463c4422 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -32,7 +32,8 @@ namespace py = pybind11; namespace paddle { namespace pybind { -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) void BindPSGPUWrapper(py::module* m) { py::class_>( *m, "PSGPU") diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.h b/paddle/fluid/pybind/ps_gpu_wrapper_py.h index 4048e88a55abca3fdaad20d8e12f6fc3a3e1a02f..8bd6ee13cf50bd2791d044ae3da450a5cf028133 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.h +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.h @@ -22,7 +22,8 @@ namespace py = pybind11; namespace paddle { namespace pybind { -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) void BindPSGPUWrapper(py::module* m); #endif } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 750fb6e225803342879695e45fd453fa8ea8acb3..d11f3c005eed50e13a28a7a26178a5077f141c55 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -86,7 +86,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -95,11 +95,13 @@ limitations under the License. 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 750fb6e225803342879695e45fd453fa8ea8acb3..d11f3c005eed50e13a28a7a26178a5077f141c55 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -86,7 +86,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/pybind/nccl_wrapper_py.h"
 #endif
 #include "paddle/fluid/framework/data_type.h"
@@ -95,11 +95,13 @@ limitations under the License. */
 #include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/fluid/string/to_string.h"
-#ifdef PADDLE_WITH_CUDA
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
 #endif
+#ifndef PADDLE_WITH_HIP
 #include "paddle/fluid/platform/cuda_profiler.h"
+#endif
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
 namespace paddle {
 namespace pybind {
 bool IsCompiledWithCUDA() {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+  return false;
+#else
+  return true;
+#endif
+}
+
+bool IsCompiledWithROCM() {
+#ifndef PADDLE_WITH_HIP
   return false;
 #else
   return true;
 #endif
 }
@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
 
   m.def("set_num_threads", &platform::SetNumThreads);
 
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   m.def("cudnn_version", &platform::CudnnVersion);
 #endif
@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
     if (dl.ctx.device_type == kDLCPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (dl.ctx.device_type == kDLGPU) {
       paddle::framework::TensorFromDLPack(dl, &tensor);
     }
@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("height", &SelectedRows::height)
       .def("set_rows",
           [](SelectedRows &self, std::vector<int64_t> rows) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
             self.set_rows(rows);
 #else
             Vector<int64_t> new_rows(rows);
@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(
                         platform::errors::PermissionDenied(
                             "Cannot use CUDAPlace in CPU only version, "
@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::CUDAPinnedPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
                     PADDLE_THROW(
                         platform::errors::PermissionDenied(
                             "Cannot use CUDAPinnedPlace in CPU only version, "
@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
                   });;
 // clang-format on
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
   py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
         )DOC")
       .def("__init__",
           [](platform::CUDAPlace &self, int dev_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
              if (UNLIKELY(dev_id < 0)) {
                LOG(ERROR) << string::Sprintf(
                    "Invalid CUDAPlace(%d), device id must be 0 or "
@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
              std::exit(-1);
 #endif
            })
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def("get_device_id",
            [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
       .def("_type", &PlaceIndex<platform::CUDAPlace>)
)DOC") .def("__init__", [](platform::CUDAPinnedPlace &self) { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); @@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 return platform::GetCUDAComputeCapability(place.device) >= 53; @@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); -#ifndef _WIN32 +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("size_of_dtype", framework::SizeOfType); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", platform::SetAllowTF32Cublas); m.def("get_cublas_switch", platform::AllowTF32Cublas); m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn); @@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_PSLIB BindHeterWrapper(&m); #endif -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) BindPSGPUWrapper(&m); #endif BindGlooWrapper(&m); @@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_BOX_PS BindBoxWrapper(&m); #endif -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BindNCCLWrapper(&m); #endif #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5ddb498980d77534823db3e78cb5077004baa9a9..5f25217007017cff963ffb6a7a701f0618b44d79 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -27,7 +27,7 @@ limitations under the License. 
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 5ddb498980d77534823db3e78cb5077004baa9a9..5f25217007017cff963ffb6a7a701f0618b44d79 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -27,7 +27,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/platform/device_context.h"
@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     const T *a = self.data<T>();
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
 #endif
   } else if (platform::is_gpu_place(self->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (paddle::platform::is_gpu_place(place)) {
       // NOTE(wangxi): When copying data to the accelerator card,
       // we need set_device(dev_id) first.
@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
       platform::CUDADeviceGuard guard(
           BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
       auto dst = self->mutable_data<T>(place);
+#ifdef PADDLE_WITH_HIP
+      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
+                                      hipMemcpyHostToDevice);
+#else
       paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
                                       cudaMemcpyHostToDevice);
+#endif
     } else if (paddle::platform::is_cuda_pinned_place(place)) {
       auto dst = self->mutable_data<T>(place);
@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
                          self.type());
 #endif
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (platform::is_cuda_pinned_place(place)) {
       output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
                            self.type());
@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
         "Please recompile or reinstall Paddle with XPU support."));
 #endif
   } else if (is_gpu_tensor) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
     PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
                       platform::errors::InvalidArgument(