diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h
index c8ab2c91e912241457a8ab2f5455ef886b3b18ae..b9ea2ade6cb90b71d423ba977215ab693f19b562 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.h
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.h
@@ -73,7 +73,7 @@ class ConditionalOp : public framework::OperatorBase {
                               ips[0]->numel()));
     bool res = false;
     if (platform::is_gpu_place(ips[0]->place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::LoDTensor cpu_tensor;
       framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
       platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc
index 2bab8e57916ef92bf57a7b6f1a691052b88b57bf..dec0e789776a43264f00294c700ff3d710d56cb2 100644
--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -26,7 +26,7 @@ namespace imperative {
 class OpBase;
 }  // namespace imperative
 }  // namespace paddle
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
@@ -34,7 +34,7 @@ namespace paddle {
 namespace operators {

 static size_t CUDADevCount() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return platform::GetCUDADeviceCount();
 #else
   return 0UL;
diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc
index 904cc214edd0983e269a885013d5ec568e7f7584..c9d4e1510985fbb7025006159705842513a4c54e 100644
--- a/paddle/fluid/operators/controlflow/while_op_helper.cc
+++ b/paddle/fluid/operators/controlflow/while_op_helper.cc
@@ -223,7 +223,7 @@ bool GetCondData(const framework::LoDTensor &cond) {
   }
   // when platform::is_gpu_place(cond.place()) is true
   std::unique_ptr<framework::LoDTensor> cpu_cond{new framework::LoDTensor()};
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
 #else
   PADDLE_THROW(platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 1915323f3c324f952c183049759945030e5728f5..efbd653ffd3b0d43f485e02a5c603ae51f1c1a9a 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -40,10 +40,12 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)

-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)
-  if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-    set(TMPDEPS memory cub)
+  if(WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      set(TMPDEPS memory cub)
+    endif()
   endif()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS})
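A note on the recurring guard: every CUDA-only `#ifdef PADDLE_WITH_CUDA` in these files becomes `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)` so the same host-side branch compiles for both back ends. A minimal sketch of how one umbrella symbol could centralize the pattern; `PADDLE_WITH_GPU_DEVICE` is a hypothetical name, not something this patch or Paddle defines:

```cpp
// Hypothetical umbrella macro (illustration only, not part of this patch):
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#define PADDLE_WITH_GPU_DEVICE
#endif

#ifdef PADDLE_WITH_GPU_DEVICE
// shared CUDA/HIP host-side path, e.g. TensorCopySync to CPUPlace
#else
// CPU-only fallback, e.g. PADDLE_THROW(...)
#endif
```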
diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index 0247093d03a9142bfdfb01927b6ad6a75e549778..0d52fd416138273354c2ef6266eed350c91ed54c 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -16,10 +16,16 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#include "paddle/fluid/platform/miopen_helper.h"
+#endif
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/for_range.h"

 namespace paddle {
@@ -58,16 +64,27 @@ static void SortDescending(const platform::CUDADeviceContext &ctx,

   // Determine temporary device storage requirements
   size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+      nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num);
+#endif
   // Allocate temporary storage
   auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
   auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

-  // Run sorting operation
+// Run sorting operation
+#ifdef PADDLE_WITH_HIP
+  hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+      d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
+      idx_out, num);
+#else
   cub::DeviceRadixSort::SortPairsDescending<T, int>(
       d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in,
       idx_out, num);
+#endif
 }

 template <typename T>
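For readers unfamiliar with the CUB convention that hipCUB mirrors: every `DeviceRadixSort` call is made twice, first with a null temp-storage pointer, which only computes the required workspace size, then with real storage to perform the sort. A self-contained sketch of that two-phase pattern outside Paddle (plain `cudaMalloc` instead of the diff's `memory::Alloc`; device pointers and a synchronous stream are assumed):

```cpp
#include <cub/cub.cuh>  // hipCUB exposes the same API under the hipcub:: namespace

void SortPairsDesc(const float* keys_in, float* keys_out, const int* vals_in,
                   int* vals_out, int n) {
  // Pass 1: null workspace pointer -> only temp_storage_bytes is written.
  size_t temp_storage_bytes = 0;
  cub::DeviceRadixSort::SortPairsDescending(nullptr, temp_storage_bytes,
                                            keys_in, keys_out, vals_in,
                                            vals_out, n);
  // Pass 2: run the sort with the workspace the first pass asked for.
  void* d_temp_storage = nullptr;
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
                                            keys_in, keys_out, vals_in,
                                            vals_out, n);
  cudaFree(d_temp_storage);
}
```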
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 86207052bb2bef4f7bea34c2614fe7686f579de8..4bb0f9ca67fb2bbda43a9a8ec6ea15f118d0ed0c 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -9,8 +9,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/memory/allocation/allocator.h>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
+
+#include <paddle/fluid/memory/allocation/allocator.h>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -135,17 +141,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {

     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+        nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
+        idx_out, total_roi_num);
+#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         nullptr, temp_storage_bytes, concat_scores.data<T>(), keys_out, idx_in,
         idx_out, total_roi_num);
+#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

-    // Run sorting operation
-    // sort score to get corresponding index
+// Run sorting operation
+// sort score to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairsDescending<T, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
+        keys_out, idx_in, idx_out, total_roi_num);
+#else
     cub::DeviceRadixSort::SortPairsDescending<T, int>(
         d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data<T>(),
         keys_out, idx_in, idx_out, total_roi_num);
+#endif
     index_out_t.Resize({real_post_num});
     Tensor sorted_rois;
     sorted_rois.mutable_data<T>({real_post_num, kBBoxSize}, dev_ctx.GetPlace());
@@ -167,17 +185,29 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     out_id_t.mutable_data<int>({real_post_num}, dev_ctx.GetPlace());
     // Determine temporary device storage requirements
     temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
+        batch_idx_in, index_out_t.data<int>(), real_post_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         nullptr, temp_storage_bytes, sorted_batch_id.data<int>(), out_id_data,
         batch_idx_in, index_out_t.data<int>(), real_post_num);
+#endif
     // Allocate temporary storage
     d_temp_storage = memory::Alloc(place, temp_storage_bytes);

-    // Run sorting operation
-    // sort batch_id to get corresponding index
+// Run sorting operation
+// sort batch_id to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs<int, int>(
+        d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
+        out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+#else
     cub::DeviceRadixSort::SortPairs<int, int>(
         d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data<int>(),
         out_id_data, batch_idx_in, index_out_t.data<int>(), real_post_num);
+#endif

     GPUGather<T>(dev_ctx, sorted_rois, index_out_t, fpn_rois);
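hipCUB deliberately mirrors CUB's interface, which is why each call above is duplicated verbatim under `hipcub::`. A sketch of an alternative that would avoid the per-call-site `#ifdef` pairs, assuming a per-translation-unit namespace alias is acceptable (this is not what the patch does):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace gpu_cub = hipcub;  // hipCUB keeps CUB's class and method names
#else
#include "cub/cub.cuh"
namespace gpu_cub = cub;
#endif

// A single call site then serves both back ends:
// gpu_cub::DeviceRadixSort::SortPairsDescending(...);
```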
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 7550ff91fd542877599dd25e341f951bdb6e604b..63f205947d9b5df1354245754c7d3544a6353039 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -12,8 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <paddle/fluid/memory/allocation/allocator.h>
+#ifdef __NVCC__
 #include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+#endif
+
+#include <paddle/fluid/memory/allocation/allocator.h>
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
@@ -143,24 +149,42 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {

     // Determine temporary device storage requirements
     size_t temp_storage_bytes = 0;
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes,
+                                       target_lvls_data, keys_out,
+                                       idx_in, idx_out, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes,
                                     target_lvls_data, keys_out,
                                     idx_in, idx_out, roi_num);
+#endif
     // Allocate temporary storage
     auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

-    // Run sorting operation
-    // sort target level to get corresponding index
+// Run sorting operation
+// sort target level to get corresponding index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs(
+        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
+        idx_in, idx_out, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs(
         d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
         idx_in, idx_out, roi_num);
+#endif

     int* restore_idx_data =
         restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
-    // sort current index to get restore index
+// sort current index to get restore index
+#ifdef PADDLE_WITH_HIP
+    hipcub::DeviceRadixSort::SortPairs(
+        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
+        restore_idx_data, roi_num);
+#else
     cub::DeviceRadixSort::SortPairs(
         d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
         restore_idx_data, roi_num);
+#endif

     int start = 0;
     auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
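Worth noting in the hunk above: the second sort (building the restore index) reuses `d_temp_storage` and `temp_storage_bytes` from the first sort instead of re-querying. That is safe here because both sorts use the same key/value types (`int`/`int`) and the same item count `roi_num`, so the required workspace cannot grow; with different types or sizes a second size query would be needed. The reuse, condensed with the diff's own names and the `#ifdef` pairs stripped:

```cpp
// One workspace query covers both sorts: identical types and item count.
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, target_lvls_data,
                                keys_out, idx_in, idx_out, roi_num);
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);
cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), temp_storage_bytes,
                                target_lvls_data, keys_out, idx_in, idx_out,
                                roi_num);  // sort by target level
cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), temp_storage_bytes,
                                idx_out, keys_out, idx_in, restore_idx_data,
                                roi_num);  // invert the permutation
```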
*/ -#include "cub/cub.cuh" #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/cuda_primitives.h" diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index da85e4c5e444c478e58a898b334962bd4c5a0045..01b15865e93b6035598b382b506504e9fcc22698 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -107,7 +107,7 @@ class TargetAssignKernel : public framework::OpKernel { int64_t k = x->dims()[2]; auto x_lod = x->lod().back(); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); #else size_t* x_lod_data = x_lod.data(); @@ -129,7 +129,7 @@ class TargetAssignKernel : public framework::OpKernel { "TargetAssignOp input(NegIndices) needs 1 level of LoD")); const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); #else size_t* neg_lod_data = neg_lod.data(); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 1417676426c2b63990c4ca92eaac6b991c49dbad..c9db6148bc45d44e8336fc662ae1b4337445f738 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -61,7 +61,7 @@ cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU) +if(WITH_GPU OR WITH_ROCM) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} selected_rows_functor scope math_function) diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc index d66281ac7c7ae518d869cb44b1240cb6d1af6563..411c0f36debd3b66014648f3bcad395d1b4a3579 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc @@ -15,6 +15,9 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
index d66281ac7c7ae518d869cb44b1240cb6d1af6563..411c0f36debd3b66014648f3bcad395d1b4a3579 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
 #include
 #include
 #include
@@ -144,7 +147,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request->set_type(::sendrecv::SELECTED_ROWS);
     payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   } else if (var->IsType<ncclUniqueId>()) {
     request->set_type(::sendrecv::NCCL_ID);
     const ncclUniqueId& uid = var->Get<ncclUniqueId>();
@@ -172,7 +175,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
         static_cast<const char*>(payload->ptr()), payload->memory_size());
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       IOBufWriter::AppendZeroCopy(
           name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
           static_cast<const char*>(payload->ptr()), payload->memory_size(),
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
index b902d3db487789a417ed0e5ffc032e7e06ba43fb..bcf20ad076b11f86870d8fbe980b8adad5c96ea8 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc
@@ -159,7 +159,7 @@ void RunTestLodTensor(platform::Place place) {
 TEST(LodTensor, Run) {
   platform::CPUPlace place;
   RunTestLodTensor(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu(0);
   RunTestLodTensor(gpu);
 #endif
@@ -168,7 +168,7 @@ TEST(SelectedRows, Run) {
   platform::CPUPlace place;
   RunSerdeTestSelectedRows(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu;
   RunSerdeTestSelectedRows(gpu);
 #endif
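The serializer changes above, and the gRPC ones that follow, keep using `ncclUniqueId`, `NCCL_ID`, and `NCCL_UNIQUE_ID_BYTES` unchanged in the ROCm build. That works because RCCL is a drop-in port of NCCL: `rccl.h` exposes the NCCL symbol names. A minimal sketch of the include pattern these files rely on, assuming only that name compatibility:

```cpp
#ifdef PADDLE_WITH_NCCL
#include <nccl.h>
#endif
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>  // declares ncclUniqueId, ncclGetUniqueId, ... like nccl.h
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// Same C API on both back ends, so serialization code is back-end agnostic.
void FillUniqueId(ncclUniqueId* id) { ncclGetUniqueId(id); }
#endif
```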
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index 13343ed4a78dd6944a0b0b7c0fbb6e11f929b7e1..0fc9b6957791490192a142321792b82cf906bfc9 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -15,6 +15,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NCCL
 #include <nccl.h>
 #endif
+#ifdef PADDLE_WITH_RCCL
+#include <rccl.h>
+#endif
 #include
 #include
 #include "grpcpp/impl/codegen/byte_buffer.h"
@@ -75,7 +78,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   } else if (var->IsType<framework::SelectedRows>()) {
     request.set_type(::sendrecv::SELECTED_ROWS);
     payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request));
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   } else if (var->IsType<ncclUniqueId>()) {
     request.set_type(::sendrecv::NCCL_ID);
 #endif
@@ -91,7 +94,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   e.WriteRawBytes(std::string(header.data(), header.size()));
 // NCCLID is copied directly to the message, return bytebuffer
 // with only one slice if serializing NCCLID.
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   if (var->IsType<ncclUniqueId>()) {
     e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                               NCCL_UNIQUE_ID_BYTES);
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
index 749c1bf39a48608876c77a74aa98be51947cf3b3..d407a72938a741d9451f46ba067e639e3ada6544 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc
@@ -206,7 +206,7 @@ TEST(LodTensor, Run) {
   platform::CPUPlace place;
   RunTestLodTensor(place);
   RunTestLodTensor(place, 1);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu(0);
   RunTestLodTensor(gpu);
   RunTestLodTensor(gpu, 1);
@@ -217,7 +217,7 @@ TEST(SelectedRows, Run) {
   platform::CPUPlace place;
   RunSerdeTestSelectedRows(place);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   platform::CUDAPlace gpu;
   RunSerdeTestSelectedRows(gpu);
 #endif
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index df47422fc059f562da644efd79a39919229ebca5..558d70e5c3353f98c052cfbcd108f859ea03e7a3 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -281,7 +281,7 @@ void prefetchs(const std::vector<std::string> &id_var_names,
       }
     }
   } else {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     std::vector<float> ids_value_vec(ids_size * vec_dim_1);
     for (auto idx = 0; idx < static_cast<int>(ids_size); idx++) {
       const auto &id = ids[idx];
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
index 39b4b3daf8c8cb8783b0fde71a19f691f3249abe..107c74eb2670e4c83184167e7758119e95e72cf9 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -39,7 +39,7 @@ using VarMsg = sendrecv::VariableMessage;
 static TensorPayload GetCommunicationAllocationFromTensor(
     const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
   if (is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     PADDLE_ENFORCE_EQ(
         is_gpu_place(tensor.place()), true,
         platform::errors::PreconditionNotMet("Please run in gpu place."));
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 4c161f044d8d749f0f969613b53a13c9aa1aea23..79b0843968e8578bc1dd865d6c2530c473a57bbe 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -33,7 +33,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
   int total_written = 0;

   if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(dev_ctx);
     platform::CPUPlace cpu;
@@ -62,7 +62,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
     gpu_dev_ctx.Wait();
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Unexpected branch, please compile with PADDLE_WITH_CUDA"));
+        "Unexpected branch, please compile with WITH_GPU or WITH_ROCM"));
 #endif
     return true;
   } else if (platform::is_xpu_place(place)) {
@@ -221,7 +221,7 @@ bool VariableResponse::ProcSerializedField(
       platform::errors::PreconditionNotMet("meta info should be got first!"));

   if (meta_.type() == sendrecv::NCCL_ID) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* var = scope_->FindVar(meta_.varname());
     if (var != nullptr) {
       ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu
index ab5ee745aaf8b68dc5bc9ef06f2be05d7fbbfb99..3d22fc60993c7b1cd9b78b6ab175d0b3621d0949 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cu
+++ b/paddle/fluid/operators/metrics/accuracy_op.cu
@@ -43,8 +43,19 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
   total[threadIdx.x] = count;
   __syncthreads();

-  // reduce the count with init value 0, and output accuracy.
+// reduce the count with init value 0, and output accuracy.
+#ifdef PADDLE_WITH_CUDA
   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+#else
+  // HIP thrust::reduce not support __device__
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+#endif
   if (threadIdx.x == 0) {
     *correct_data = result;
     *accuracy = static_cast<float>(result) / static_cast<float>(N);
diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu
index 13da4ff0857d97d61ae8d4da9b05b0f27128d94e..40609381c17aee7d8dce52b30b9110ff8b39860d 100644
--- a/paddle/fluid/operators/metrics/auc_op.cu
+++ b/paddle/fluid/operators/metrics/auc_op.cu
@@ -130,6 +130,7 @@ class AucCUDAKernel : public framework::OpKernel<T> {
     auto *pos_in_data = stat_pos_in_tensor->data<int64_t>();
     auto *stat_neg_in_tensor = ctx.Input<Tensor>("StatNeg");
     auto *neg_in_data = stat_neg_in_tensor->data<int64_t>();
+#ifdef PADDLE_WITH_CUDA
     if (stat_pos_in_tensor != stat_pos) {
       cudaMemcpy(origin_stat_pos, pos_in_data,
                  ((1 + slide_steps) * (num_thresholds + 1) +
                   (slide_steps > 0 ? 1 : 0)) *
                      sizeof(int64_t),
                  cudaMemcpyDeviceToDevice);
     }
     if (stat_neg_in_tensor != stat_neg) {
       cudaMemcpy(origin_stat_neg, neg_in_data,
                  ((1 + slide_steps) * (num_thresholds + 1) +
                   (slide_steps > 0 ? 1 : 0)) *
                      sizeof(int64_t),
                  cudaMemcpyDeviceToDevice);
     }
+#else
+    if (stat_pos_in_tensor != stat_pos) {
+      hipMemcpy(origin_stat_pos, pos_in_data,
+                ((1 + slide_steps) * (num_thresholds + 1) +
+                 (slide_steps > 0 ? 1 : 0)) *
+                    sizeof(int64_t),
+                hipMemcpyDeviceToDevice);
+    }
+    if (stat_neg_in_tensor != stat_neg) {
+      hipMemcpy(origin_stat_neg, neg_in_data,
+                ((1 + slide_steps) * (num_thresholds + 1) +
+                 (slide_steps > 0 ? 1 : 0)) *
+                    sizeof(int64_t),
+                hipMemcpyDeviceToDevice);
+    }
+#endif
     statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos,
             origin_stat_neg);
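The accuracy kernel hunk above replaces `thrust::reduce(thrust::device, ...)`, which HIP could not call from device code at the time, with a hand-rolled shared-memory tree reduction. Two properties of that loop are worth keeping in mind: the stride halves each step, so it only sums all `BlockSize` elements when `BlockSize` is a power of two, and the `__syncthreads()` sits outside the `if` so that every thread reaches the barrier. A standalone sketch of the same pattern (hypothetical kernel, not from the patch; compiles as either CUDA or HIP):

```cpp
// In-block tree reduction over shared memory: after the loop, partial[0]
// holds the sum of partial[0..BlockSize-1]. Requires BlockSize to be a
// power of two and all BlockSize threads of the block to participate.
template <int BlockSize>
__global__ void BlockSumKernel(const int* in, int* out) {
  __shared__ int partial[BlockSize];
  partial[threadIdx.x] = in[threadIdx.x];
  __syncthreads();
  for (int s = BlockSize / 2; s > 0; s >>= 1) {
    if (threadIdx.x < s) {
      partial[threadIdx.x] += partial[threadIdx.x + s];
    }
    __syncthreads();  // outside the if: every thread must hit the barrier
  }
  if (threadIdx.x == 0) {
    *out = partial[0];
  }
}

// Example launch: BlockSumKernel<256><<<1, 256>>>(d_in, d_out);
```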