diff --git a/paddle/fluid/extension/include/ext_place.h b/paddle/fluid/extension/include/ext_place.h
index 91d4f41c2135145062fb98b2df77b14ec4e1a32b..c9ed40a382417f6a85c4213f1d5d965797863c92 100644
--- a/paddle/fluid/extension/include/ext_place.h
+++ b/paddle/fluid/extension/include/ext_place.h
@@ -17,6 +17,6 @@ limitations under the License. */
 namespace paddle {

 // TODO(yangjiabin): Add other place support in next PR
-enum class PlaceType { kUNK = -1, kCPU, kGPU };
+enum class PlaceType { kUNK = -1, kCPU, kGPU, kHIP };

 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h
index fa91490e6cd8af7c3a4ff4b2b3013b519c59aa2a..d40503409fbdc1dafe5476c742521da7792f0211 100644
--- a/paddle/fluid/extension/include/ext_tensor.h
+++ b/paddle/fluid/extension/include/ext_tensor.h
@@ -116,9 +116,11 @@ class PD_DLL_DECL Tensor {
   /// \brief Check Tensor is initialized
   bool is_initialized() const;

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
   /// \brief Get current stream of Tensor
   cudaStream_t stream() const;
+#elif defined(PADDLE_WITH_HIP)
+  hipStream_t stream() const;
 #endif

  private:
diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc
index ab98bdc0bfb47e07e5742ac1ee9cebe60f5c7a69..a9e286b4f9b231091c71cb06a2bc3996ab0fbbed 100644
--- a/paddle/fluid/extension/src/ext_tensor.cc
+++ b/paddle/fluid/extension/src/ext_tensor.cc
@@ -53,7 +53,7 @@ struct CastDataType {
       auto *context = static_cast<const platform::CPUDeviceContext *>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto *context = static_cast<const platform::CUDADeviceContext *>(ctx_);
@@ -67,10 +67,11 @@ struct CastDataType {
     }
   }
 };
+
 template <typename T>
-void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
-             int64_t ele_size) {
-#ifdef PADDLE_WITH_CUDA
+void DeviceCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
+                int64_t ele_size) {
+#if defined(PADDLE_WITH_CUDA)
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   int device_num = paddle::platform::GetCurrentDeviceId();
   platform::CUDAPlace gpu_place(device_num);
@@ -90,6 +91,30 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
         "Only GPU related Copy can reach this func."));
   }
   cudaStreamSynchronize(dev_ctx->stream());
+#elif defined(PADDLE_WITH_HIP)
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  int device_num = paddle::platform::GetCurrentDeviceId();
+  platform::CUDAPlace gpu_place(device_num);
+  auto *dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+  if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kCPU)) {
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(dst), gpu_place,
+                 src, ele_size, dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kHIP) && (dst_plc == PlaceType::kHIP)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), gpu_place, src, ele_size,
+                 dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kCPU) && (dst_plc == PlaceType::kHIP)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), platform::CPUPlace(),
+                 src, ele_size, dev_ctx->stream());
+  } else {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Only GPU related Copy can reach this func."));
+  }
+  hipStreamSynchronize(dev_ctx->stream());
+#else
+  PADDLE_THROW(platform::errors::Unavailable(
+      "This function can only be used if compiled with "
+      "either -DWITH_ROCM=ON or -DWITH_GPU=ON"));
 #endif
 }
@@ -137,11 +162,16 @@ T *Tensor::mutable_data() {
     case static_cast<int>(PlaceType::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
     }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA)
     case static_cast<int>(PlaceType::kGPU): {
       int device_num = platform::GetCurrentDeviceId();
       return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
     }
+#elif defined(PADDLE_WITH_HIP)
+    case static_cast<int>(PlaceType::kHIP): {
+      int device_num = platform::GetCurrentDeviceId();
+      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+    }
 #endif
     default:
       PADDLE_THROW(platform::errors::Unavailable(
@@ -202,17 +232,23 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
   target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();

+  bool supported_gpu_transform = false;
+#if defined(PADDLE_WITH_CUDA)
+  supported_gpu_transform =
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kGPU) ||
+      (src_place == PlaceType::kGPU && target_place == PlaceType::kGPU);
+#elif defined(PADDLE_WITH_HIP)
+  supported_gpu_transform =
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kCPU) ||
+      (src_place == PlaceType::kCPU && target_place == PlaceType::kHIP) ||
+      (src_place == PlaceType::kHIP && target_place == PlaceType::kHIP);
+#endif
+
   if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
     std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kCPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kCPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
-  } else if ((src_place == PlaceType::kGPU) &&
-             (target_place == PlaceType::kGPU)) {
-    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if (supported_gpu_transform) {
+    DeviceCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
   } else {
     PADDLE_THROW(platform::errors::Unavailable(
         "Not supported place transform of place: %d to place: %d",
@@ -304,13 +340,18 @@ const PlaceType &Tensor::place() const {
   GET_CASTED_TENSOR;
   if (platform::is_cpu_place(tensor->place())) {
     place_ = PlaceType::kCPU;
+#if defined(PADDLE_WITH_CUDA)
   } else if (platform::is_gpu_place(tensor->place())) {
     place_ = PlaceType::kGPU;
+#elif defined(PADDLE_WITH_HIP)
+  } else if (platform::is_gpu_place(tensor->place())) {
+    place_ = PlaceType::kHIP;
+#endif
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Current Tensor hold unsupported Place Type, Please Init it"
-        "using Tensor::mutable_data<T>(PaddlePlace) which T is"
-        "either Place::kCPU or Place::kGPU"));
+        "using Tensor::mutable_data<T>(PaddlePlace) with T among: "
+        "Place::kCPU or Place::kGPU or Place::kHIP"));
   }
   return place_;
 }
@@ -392,16 +433,21 @@ bool Tensor::is_initialized() const {
   }
 }

-#ifdef PADDLE_WITH_CUDA
-cudaStream_t Tensor::stream() const {
-  if (!stream_.IsStreamSet()) {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Stream is not Set, only input tensor will have "
-        "stream which is set by framework "));
-  } else {
-    return reinterpret_cast<cudaStream_t>(stream_.GetStream());
+#define DEFINE_STREAM(_stream_t_)                                  \
+  _stream_t_ Tensor::stream() const {                              \
+    if (!stream_.IsStreamSet()) {                                  \
+      PADDLE_THROW(platform::errors::PreconditionNotMet(           \
+          "Stream is not Set, only input tensor will have "        \
+          "stream which is set by framework "));                   \
+    } else {                                                       \
+      return reinterpret_cast<_stream_t_>(stream_.GetStream());    \
+    }                                                              \
   }
-}
+
+#if defined(PADDLE_WITH_CUDA)
+DEFINE_STREAM(cudaStream_t)
+#elif defined(PADDLE_WITH_HIP)
+DEFINE_STREAM(hipStream_t)
 #endif

 namespace framework {
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 284c438e395d38a0e8f98a564320adf92affe6d6..ed4d042062e368105d1e18d69419a001e94c4163 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -403,10 +403,20 @@ configure_file(commit.h.in commit.h)
 include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include)

-cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
+if(WITH_ROCM)
+  hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
+else()
+  cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce)
+endif()
+
 cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor)
 cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info)
-cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
+
+if(WITH_ROCM)
+  hip_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
+else()
+  cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
+endif()

 set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 733831263a184f5060cca58c26866ac3350c155c..b2896c74c39a6e27822446029a7884164369bb4f 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -45,7 +45,19 @@ void TestCopyTensor() {
   auto t1_gpu_cp_cp = t1_gpu_cp.template copy_to<T>(paddle::PlaceType::kGPU);
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp_cp.place()));
   auto t1_gpu_cp_cp_cpu =
-      t1_gpu_cp.template copy_to<T>(paddle::PlaceType::kCPU);
+      t1_gpu_cp_cp.template copy_to<T>(paddle::PlaceType::kCPU);
+  CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place()));
+  for (int64_t i = 0; i < t1.size(); i++) {
+    CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], T(5));
+  }
+#elif defined(PADDLE_WITH_HIP)
+  VLOG(2) << "Do HIP copy test";
+  auto t1_gpu_cp = t1_cpu_cp.template copy_to<T>(paddle::PlaceType::kHIP);
+  CHECK((paddle::PlaceType::kHIP == t1_gpu_cp.place()));
+  auto t1_gpu_cp_cp = t1_gpu_cp.template copy_to<T>(paddle::PlaceType::kHIP);
+  CHECK((paddle::PlaceType::kHIP == t1_gpu_cp_cp.place()));
+  auto t1_gpu_cp_cp_cpu =
+      t1_gpu_cp_cp.template copy_to<T>(paddle::PlaceType::kCPU);
   CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], T(5));
@@ -60,6 +72,11 @@ void TestAPIPlace() {
   t1.reshape(tensor_shape);
   t1.mutable_data<float>();
   CHECK((paddle::PlaceType::kGPU == t1.place()));
+#elif defined(PADDLE_WITH_HIP)
+  auto t1 = paddle::Tensor(paddle::PlaceType::kHIP);
+  t1.reshape(tensor_shape);
+  t1.mutable_data<float>();
+  CHECK((paddle::PlaceType::kHIP == t1.place()));
 #endif
   auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
   t2.reshape(tensor_shape);
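Taken together, the diff gives custom-operator code the same surface on ROCm that it already has on CUDA builds: `PlaceType::kHIP` for placement, `copy_to` backed by the renamed `DeviceCopy`, and a `hipStream_t` from `Tensor::stream()`. Below is a minimal usage sketch — our illustration, not code from this diff. It assumes a Paddle build configured with `-DWITH_ROCM=ON` and the public custom-op header `paddle/extension.h`; the function name `HipRoundTripSketch` is hypothetical.

```cpp
// Sketch (ours, not part of the diff): round-trip a tensor through the new
// kHIP place, mirroring the path TestCopyTensor exercises above.
// Assumption: built with -DWITH_ROCM=ON.
#include <cassert>
#include <cstdint>

#include "paddle/extension.h"

void HipRoundTripSketch() {
  // Fill a CPU tensor with ones.
  auto t_cpu = paddle::Tensor(paddle::PlaceType::kCPU);
  t_cpu.reshape({2, 3});
  auto *src = t_cpu.mutable_data<float>();
  for (int64_t i = 0; i < t_cpu.size(); ++i) {
    src[i] = 1.0f;
  }

  // CPU -> HIP -> CPU. Both hops dispatch to DeviceCopy, which issues
  // memory::Copy on the device stream and ends with hipStreamSynchronize,
  // so the data is visible once copy_to returns.
  auto t_hip = t_cpu.copy_to<float>(paddle::PlaceType::kHIP);
  auto t_back = t_hip.copy_to<float>(paddle::PlaceType::kCPU);

  // After the round trip every element should still be 1.0f. Note that
  // Tensor::stream() returns a hipStream_t on HIP builds, but only for
  // tensors whose stream was set by the framework (e.g. kernel inputs);
  // calling it on t_hip here would throw PreconditionNotMet.
  const float *result = t_back.data<float>();
  for (int64_t i = 0; i < t_back.size(); ++i) {
    assert(result[i] == 1.0f);
  }
}
```

The same sketch compiles against a CUDA build with `kGPU` substituted for `kHIP`; keeping the two paths symmetric is what motivates renaming `GpuCopy` to `DeviceCopy` and collapsing the per-pair branches in `copy_to` into the single `supported_gpu_transform` check.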