diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 410889cfb9e7e5530e385ee0f9d43283652ebcdb..47344f0e3733d6ce400145755e995d45591d2eef 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -52,7 +52,12 @@ ENDIF() cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) +IF(WITH_GPU) + nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) +ENDIF() +IF(WITH_ROCM) + hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) +ENDIF() cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) @@ -72,7 +77,7 @@ IF(WITH_DGC) set(dgc_deps dgc) ENDIF() -IF(WITH_GPU) +IF(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) ENDIF() @@ -81,9 +86,14 @@ IF(WITH_MKLDNN) ELSE() set(MKLDNN_CTX_DEPS) ENDIF() - -nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) IF(WITH_GPU) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +ENDIF() +IF(WITH_ROCM) + hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +ENDIF() + +IF(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) ELSE() set(STREAM_CALLBACK_DEPS) @@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) -if(WITH_GPU) +if(WITH_GPU OR WITH_ROCM) cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) target_link_libraries(device_context cuda_resource_pool) endif() -nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - cc_test(init_test SRCS init_test.cc DEPS device_context) -nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) -nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) -nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +if(WITH_GPU) + nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) + nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) + nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) + nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +endif() + +if(WITH_ROCM) + hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) + hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda) + hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor) + hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) +endif() cc_library(timer SRCS timer.cc) cc_test(timer_test SRCS timer_test.cc DEPS timer) @@ -127,25 +145,34 @@ if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) +elseif(WITH_ROCM) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) + hip_test(cuda_helper_test SRCS cuda_helper_test.cu) + hip_library(device_memory_aligment SRCS 
device_memory_aligment.cc DEPS cpu_info gpu_info place) else() cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) - -nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) - cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) -nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) +IF(WITH_GPU) + nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) + nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) + nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) +ENDIF() -nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) +IF(WITH_ROCM) + hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) + hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) + hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) +ENDIF() if(NOT APPLE AND NOT WIN32) cc_library(device_code SRCS device_code.cc DEPS device_context) - if(WITH_GPU) + if(WITH_GPU OR WITH_ROCM) cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor) endif() endif() diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 0ef3a1844854475f5d2419ebdc837fdc4674bedc..4b16a67b235fdd153d8a8fa9ba6bf6c848abe877 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -13,10 +13,11 @@ // limitations under the License. #include "paddle/fluid/platform/collective_helper.h" +#include namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLCommImpl : public NCCLComm { public: void set_ring_id(int ring_id) { ring_id_ = ring_id; } @@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm { void set_comm(ncclComm_t comm) { comm_ = comm; } ncclComm_t comm() const override { return comm_; } - cudaStream_t stream() const override { return dev_ctx_->stream(); } + gpuStream_t stream() const override { return dev_ctx_->stream(); } void set_dev_ctx(std::unique_ptr&& dev_ctx) { dev_ctx_ = std::move(dev_ctx); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 0cd501da428bc5bfe1a858e920d5a6bc6f20c69f..8a6719ab685b847c8aee0fc25a21477a44233c38 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -27,7 +27,7 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // In order to apply hierarchical communication with NCCL, we need // a communication ring contains NCCL communicators associated to a global // ncclUniqueId. E.g. 
for a hierarchical case, @@ -56,7 +56,7 @@ class NCCLComm { virtual int rank() const = 0; virtual int device_id() const = 0; virtual ncclComm_t comm() const = 0; - virtual cudaStream_t stream() const = 0; + virtual gpuStream_t stream() const = 0; virtual CUDADeviceContext* dev_context() const = 0; virtual ~NCCLComm() = default; }; diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index a70050bae113d4343b0f5d5ca569fa00ef479fe8..4f504b414de4a7e8413fcb8f853c07d55ecff493 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -14,10 +14,8 @@ limitations under the License. */ #pragma once -#include // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 -#include #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -25,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace platform { +#ifdef PADDLE_WITH_HIP +#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) +#else #if CUDA_VERSION < 9000 #define CREATE_SHFL_MASK(mask, predicate) mask = 0u; #else @@ -32,6 +33,7 @@ namespace platform { #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif +#endif inline static int RoundToPowerOfTwo(int dim) { if (dim > 512) { @@ -67,7 +69,7 @@ template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { -#if CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 return __shfl_down(val, delta, width); #else return __shfl_down_sync(mask, val, static_cast(delta), width); @@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, template __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, int width = warpSize) { -#if CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 return __shfl_xor(val, width); #else return __shfl_xor_sync(mask, val, width); @@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, } // CUDA 9.0 have native compatible float16 shfl_down -#if CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { +#ifdef PADDLE_WITH_HIP + return float16(__shfl_down(static_cast(val), + static_cast(delta), width)); +#else return float16( __shfl_down(static_cast(val), static_cast(delta), width)); +#endif } template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { +#ifdef PADDLE_WITH_HIP + return float16(__shfl_xor(static_cast(val), width)); +#else return float16(__shfl_xor(static_cast(val), width)); +#endif } #else template <> @@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { -#if CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 return __shfl(val, src_line, width); #else return __shfl_sync(mask, val, src_line, width); @@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() { template __device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU but not specified as 32 simply. 
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
+// NOTE(zcd): The warp size should be taken from the
+// parameters of the GPU but not specified as 32 simply.
+// To make the reduceSum more efficiently,
+// I use Warp-Level Parallelism and assume the Warp size
+// is 32 which may be different for different GPU,
+// but most card's warp size is 32.
+#ifdef PADDLE_WITH_HIP
+  const int warpSize = 64;
+#else
   const int warpSize = 32;
+#endif
   __shared__ T shm[warpSize];
   unsigned mask = 0u;
   CREATE_SHFL_MASK(mask, tid < len);
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 9357d5db17cd1c38f6ffeb92a8820d5eeee809dd..ef0e3a72d1a67302edefe2e8ddc594e23286da9e 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -16,11 +16,16 @@
 
 #include <mutex>  // NOLINT
 
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
+#endif
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/rocblas.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
 enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 #endif
 
@@ -77,6 +82,12 @@ namespace platform {
 
 class CublasHandleHolder {
  public:
+#ifdef PADDLE_WITH_HIP
+  explicit CublasHandleHolder(hipStream_t stream) {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
+  }
+#else
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
@@ -92,9 +103,14 @@ class CublasHandleHolder {
     }
 #endif  // CUDA_VERSION >= 9000
   }
+#endif
 
   ~CublasHandleHolder() PADDLE_MAY_THROW {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
+#else
     PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
+#endif
   }
 
   template <typename Callback>
@@ -106,7 +122,11 @@ class CublasHandleHolder {
  private:
   DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
 
+#ifdef PADDLE_WITH_HIP
+  rocblas_handle handle_;
+#else
   cublasHandle_t handle_;
+#endif
 
   mutable std::mutex mtx_;
 };
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index 044f4d6748e3ad72c097c317784fa2b6b9775bcd..fd46aa239340351b5e5464af4a19dd7a04296914 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -47,8 +47,13 @@ void TestCase(size_t num) {
   T *in1, *in2, *out;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<T *>(malloc(size));
   in2 = reinterpret_cast<T *>(malloc(size));
   out = reinterpret_cast<T *>(malloc(size));
@@ -58,12 +63,22 @@ void TestCase(size_t num) {
     in1[i] = static_cast<T>(dist(engine));
     in2[i] = static_cast<T>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
   AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num; ++i) {
     // NOTE(dzhwinter): the float16 add has small underflow/overflow
     // so we use EXPECT_NEAR to check the result.
@@ -73,8 +88,13 @@ void TestCase(size_t num) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 // cuda primitives
@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   size_t size = sizeof(uint8_t) * (num + shift_bit);
   size_t array_size = sizeof(float16) * (num / 2);
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
   in1 = reinterpret_cast<float16 *>(malloc(size));
   in2 = reinterpret_cast<float16 *>(malloc(size));
   out = reinterpret_cast<float16 *>(malloc(size));
@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
     r_in1[i] = static_cast<float16>(dist(engine));
     r_in2[i] = static_cast<float16>(dist(engine));
   }
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
+                     num / 2);
+  hipDeviceSynchronize();
+  hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
   AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
   cudaDeviceSynchronize();
   cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   for (size_t i = 0; i < num / 2; ++i) {
     // NOTE(dzhwinter): the float16 add has small truncate error.
     // so we use EXPECT_NEAR to check the result.
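Note: every hunk in this test file repeats the same launch-porting pattern — CUDA's `kernel<<<grid, block, shared, stream>>>(args)` syntax becomes the `hipLaunchKernelGGL(kernel, grid, block, shared, stream, args...)` macro on ROCm. A minimal standalone sketch of that guard, assuming only the stock CUDA/HIP runtime headers and reusing the patch's `PADDLE_WITH_HIP` macro; the kernel and its names are illustrative, not part of the patch:

```cpp
#include <cstdio>
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Grid-stride kernel; compiles unchanged under nvcc and hipcc.
__global__ void ScaleKernel(float* x, float a, size_t n) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    x[i] *= a;
  }
}

int main() {
  constexpr size_t n = 1 << 10;
  float* d_x = nullptr;
#ifdef PADDLE_WITH_HIP
  hipMalloc(reinterpret_cast<void**>(&d_x), n * sizeof(float));
  // HIP spells the launch as a macro; HIP_KERNEL_NAME(...) is only needed
  // when the kernel name contains commas (i.e. template arguments).
  hipLaunchKernelGGL(ScaleKernel, dim3(1), dim3(256), 0, 0, d_x, 2.0f, n);
  hipDeviceSynchronize();
  hipFree(d_x);
#else
  cudaMalloc(reinterpret_cast<void**>(&d_x), n * sizeof(float));
  ScaleKernel<<<1, 256>>>(d_x, 2.0f, n);
  cudaDeviceSynchronize();
  cudaFree(d_x);
#endif
  std::printf("done\n");
  return 0;
}
```

This is why the hunks wrap templated launches in `HIP_KERNEL_NAME(AddKernel<T>)`: the macro argument would otherwise be split at the comma inside the template argument list.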
@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
   free(in1);
   free(in2);
   free(out);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaAtomic, float16Unalign) {
@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
   T* in1;
   T *d_in1, *d_in2;
   size_t size = sizeof(T) * num;
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), sizeof(T));
+#else
   cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
   cudaMalloc(reinterpret_cast<void **>(&d_in2), sizeof(T));
+#endif
   in1 = reinterpret_cast<T *>(malloc(size));
   std::minstd_rand engine;
   std::uniform_real_distribution<double> dist(0.0, 1.0);
@@ -212,17 +258,31 @@
     in1[i] = static_cast<T>(dist(engine));
   }
   auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipDeviceSynchronize();
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
+                     dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
+  hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
+  hipDeviceSynchronize();
+#else
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
   cudaDeviceSynchronize();
   DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
   cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
+#endif
   // NOTE(dzhwinter): the float16 add has small underflow/overflow
   // so we use EXPECT_NEAR to check the result.
   EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
   free(in1);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
   cudaFree(d_in1);
   cudaFree(d_in2);
+#endif
 }
 
 TEST(CudaShuffleSync, float16) {
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 72430a3f75323c2d81d0e83b95361e601080c423..340372007a77b0faa61f027424b843b74afea4f3 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <stdio.h>
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
       static_cast<unsigned long long int>(val));  // NOLINT
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
 USE_CUDA_ATOMIC(Max, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Max, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) {  // NOLINT
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int old = *address, assumed;  // NOLINT
@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
 
 CUDA_ATOMIC_WRAPPER(Max, float) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   int *const address_as_i = reinterpret_cast<int *>(address);
@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
 
 CUDA_ATOMIC_WRAPPER(Max, double) {
   if (*address >= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT
@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
 USE_CUDA_ATOMIC(Min, unsigned int);
 // CUDA API uses unsigned long long int, we cannot use uint64_t here.
 // It because unsigned long long int is not necessarily uint64_t
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
 USE_CUDA_ATOMIC(Min, unsigned long long int);  // NOLINT
 #else
 CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) {  // NOLINT
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int old = *address, assumed;  // NOLINT
@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
 
 CUDA_ATOMIC_WRAPPER(Min, float) {
   if (*address <= val) {
-    return;
+    return *address;
  }
 
   int *const address_as_i = reinterpret_cast<int *>(address);
@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
 
 CUDA_ATOMIC_WRAPPER(Min, double) {
   if (*address <= val) {
-    return;
+    return *address;
   }
 
   unsigned long long int *const address_as_ull =  // NOLINT
diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc
index 6ecb312d72072c7904430e52acb77944abd04417..70d2ec55057988eb72f92362c235e9a3879b3579 100644
--- a/paddle/fluid/platform/cuda_resource_pool.cc
+++ b/paddle/fluid/platform/cuda_resource_pool.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
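Note: the `return;` → `return *address;` changes in the cuda_primitives.h hunks above fix a genuine bug — `CUDA_ATOMIC_WRAPPER(op, T)` expands to a device function returning `T`, so the fast-path early exit must still hand back the old value, matching the semantics of the hardware atomics it falls back to. A minimal standalone sketch of the compare-and-swap idiom those wrappers implement (the function name is illustrative; `atomicCAS`, `__float_as_int`, and `__int_as_float` are standard device intrinsics on both CUDA and HIP):

```cpp
// CAS-loop atomic max for float, mirroring the fixed wrapper above.
__device__ float AtomicMaxFloat(float* address, float val) {
  if (*address >= val) {
    return *address;  // early exit must still return the old value
  }
  int* const address_as_i = reinterpret_cast<int*>(address);
  int old = *address_as_i, assumed;
  do {
    assumed = old;
    if (__int_as_float(assumed) >= val) {
      break;  // another thread already stored something at least as large
    }
    // atomicCAS returns the previous word; retry until our write sticks.
    old = atomicCAS(address_as_i, assumed, __float_as_int(val));
  } while (assumed != old);
  return __int_as_float(old);
}
```

Returning the old value on every path keeps the wrapper usable inside read-modify-write expressions, which is why the patch changes all six Max/Min early exits the same way.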
-#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_resource_pool.h" #include "paddle/fluid/platform/gpu_info.h" @@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() { for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); - cudaStream_t stream; + gpuStream_t stream; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); +#else PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); +#endif return stream; }; - auto deleter = [dev_idx](cudaStream_t stream) { + auto deleter = [dev_idx](gpuStream_t stream) { platform::SetDeviceId(dev_idx); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream)); +#endif }; pool_.emplace_back( @@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() { for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { auto creator = [dev_idx] { platform::SetDeviceId(dev_idx); - cudaEvent_t event; + gpuEvent_t event; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(&event, hipEventDisableTiming)); +#else PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); +#endif return event; }; - auto deleter = [dev_idx](cudaEvent_t event) { + auto deleter = [dev_idx](gpuEvent_t event) { platform::SetDeviceId(dev_idx); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); +#endif }; pool_.emplace_back(ResourcePool::Create(creator, deleter)); diff --git a/paddle/fluid/platform/cuda_resource_pool.h b/paddle/fluid/platform/cuda_resource_pool.h index 570b68b08fc1e6fd0b9ff70dba244c355fc2618d..2ac13e692f783742aa5bad903aaae416b605a2f1 100644 --- a/paddle/fluid/platform/cuda_resource_pool.h +++ b/paddle/fluid/platform/cuda_resource_pool.h @@ -14,9 +14,17 @@ #pragma once +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + #ifdef PADDLE_WITH_CUDA #include #include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + #include #include #include @@ -26,8 +34,8 @@ namespace paddle { namespace platform { -using CudaStreamObject = std::remove_pointer::type; -using CudaEventObject = std::remove_pointer::type; +using CudaStreamObject = std::remove_pointer::type; +using CudaEventObject = std::remove_pointer::type; class CudaStreamResourcePool { public: diff --git a/paddle/fluid/platform/cudnn_desc_test.cc b/paddle/fluid/platform/cudnn_desc_test.cc index 0adbc7e4af26700f4e9a6e2e16a2cd39b506e50f..db5362f5cb1f5d4c84d6816c1672340c7f22cb2f 100644 --- a/paddle/fluid/platform/cudnn_desc_test.cc +++ b/paddle/fluid/platform/cudnn_desc_test.cc @@ -12,7 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_desc.h" +#else #include "paddle/fluid/platform/cudnn_desc.h" +#endif #include diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc index 0975d990b473a2ad3f3b8c83d3755771e24d6d9e..a4226dabf9d52465ba935ce1d631a9254ad27172 100644 --- a/paddle/fluid/platform/device_code.cc +++ b/paddle/fluid/platform/device_code.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/device_code.h" - #include +#include #include +#include +#include "paddle/fluid/platform/device_code.h" #include "paddle/fluid/platform/enforce.h" DECLARE_string(cuda_dir); @@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (is_gpu_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "CUDAPlace is not supported, please re-compile with WITH_GPU=ON.")); + "CUDAPlace or HIPPlace is not supported, please re-compile with " + "WITH_GPU=ON or WITH_ROCM=ON.")); #endif } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) CUDADeviceCode::CheckAvailableStatus(); #endif } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_HIP +static bool CheckCUDADriverResult(hipError_t result, std::string caller, + std::string kernel_name = "") { + if (result != hipSuccess) { + const char* error = nullptr; + error = dynload::hipGetErrorString(result); +#else static bool CheckCUDADriverResult(CUresult result, std::string caller, std::string kernel_name = "") { if (result != CUDA_SUCCESS) { const char* error = nullptr; dynload::cuGetErrorString(result, &error); +#endif LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name << " > failed: " << error << " (" << result << ")"; return false; @@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() { int nvrtc_major = 0; int nvrtc_minor = 0; +#ifdef PADDLE_WITH_HIP + hiprtcResult nvrtc_result = + dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); +#else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); +#endif int driver_version = 0; int dirver_major = 0; int driver_minor = 0; +#ifdef PADDLE_WITH_HIP + hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); + if (driver_result == hipSuccess) { +#else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { +#endif dirver_major = driver_version / 1000; driver_minor = (driver_version % 1000) / 10; } @@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() { LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "." << driver_minor << "; NVRTC Version: " << nvrtc_major << "." << nvrtc_minor; +#ifdef PADDLE_WITH_HIP + if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { +#else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { +#endif return; } int count = 0; +#ifdef PADDLE_WITH_HIP + if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), + "hipGetDeviceCount")) { +#else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { +#endif available_ = true; } } @@ -163,14 +192,20 @@ static std::string FindCUDAIncludePath() { } } +#ifdef PADDLE_WITH_HIP + cuda_include_path = "/opt/rocm/include"; +#else cuda_include_path = "/usr/local/cuda/include"; +#endif + if (stat(cuda_include_path.c_str(), &st) == 0) { return cuda_include_path; } - LOG(WARNING) << "Cannot find CUDA include path." - << "Please check whether CUDA is installed in the default " - "installation path, or specify it by export " - "FLAGS_cuda_dir=xxx."; + LOG(WARNING) + << "Cannot find CUDA or ROCM include path." 
+ << "Please check whether CUDA or ROCM is installed in the default " + "installation path, or specify it by export " + "FLAGS_cuda_dir=xxx."; return ""; } @@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name, place_ = place; name_ = name; +#ifdef PADDLE_WITH_HIP + kernel_ = "#include \n" + kernel; +#else kernel_ = kernel; +#endif } bool CUDADeviceCode::Compile(bool include_path) { @@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) { << "NVRTC and CUDA driver are need for JIT compiling of CUDA code."; return false; } +#ifdef PADDLE_WITH_HIP + hiprtcProgram program; + if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + "hiprtcCreateProgram")) { + return false; + } + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; + std::string include_option; + if (include_path) { + std::string cuda_include_path = FindCUDAIncludePath(); + if (!cuda_include_path.empty()) { + include_option = "--include-path=" + cuda_include_path; + options.push_back(include_option.c_str()); + } + } + hiprtcResult compile_result = + dynload::hiprtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == HIPRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size), + "hiprtcGetProgramLogSize")) { + return false; + } + std::vector log; + log.resize(log_size + 1); + if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()), + "hiprtcGetProgramLog")) { + return false; + } + LOG(WARNING) << "JIT compiling of ROCM GPU code failed:" + << "\n Kernel name: " << name_ << "\n Kernel body:\n" + << kernel_ << "\n Compiling log: " << log.data(); + + return false; + } + + // Obtain PTX from the program for cuda + // Obtain Code from the program for hip + size_t ptx_size; + if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size), + "hiprtcGetCodeSize")) { + return false; + } + ptx_.resize(ptx_size + 1); + if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()), + "hiprtcGetCode")) { + return false; + } + + if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program), + "hiprtcDestroyProgram")) { + return false; + } + + if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()), + "hipModuleLoadData")) { + return false; + } + + if (!CheckCUDADriverResult( + dynload::hipModuleGetFunction(&function_, module_, name_.c_str()), + "hipModuleGetFunction")) { + return false; + } +#else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, kernel_.c_str(), // buffer @@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) { "cuModuleGetFunction", name_)) { return false; } +#endif max_threads_ = dev_ctx->GetMaxPhysicalThreadCount(); is_compiled_ = true; @@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_EQ( + dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1, // grid dim + num_threads_, 1, 1, // block dim + 0, // shared 
memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + hipSuccess, + errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", + name_.c_str())); +#else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim num_threads_, 1, 1, // block dim @@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { CUDA_SUCCESS, errors::External("Fail to launch kernel %s (in cuLaunchKernel.)", name_.c_str())); +#endif } +#ifdef PADDLE_WITH_HIP +bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result, + std::string function) { + if (result != HIPRTC_SUCCESS) { + LOG_FIRST_N(WARNING, 1) + << "Call " << function << " for < " << name_ + << " > failed: " << dynload::hiprtcGetErrorString(result); + return false; + } +#else bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result, << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } +#endif return true; } #endif diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h index 4199317a8ceb03dddc5cc439dc455725b8863678..6b1c284abbd7e1a601842f160f8dcbc0d365059a 100644 --- a/paddle/fluid/platform/device_code.h +++ b/paddle/fluid/platform/device_code.h @@ -25,6 +25,10 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cuda_driver.h" #include "paddle/fluid/platform/dynload/nvrtc.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hiprtc.h" +#include "paddle/fluid/platform/dynload/rocm_driver.h" +#endif namespace paddle { namespace platform { @@ -44,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CUDADeviceCode : public DeviceCode { public: explicit CUDADeviceCode(const Place& place, const std::string& name, @@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode { static bool IsAvailable() { return available_; } private: +#ifdef PADDLE_WITH_HIP + bool CheckNVRTCResult(hiprtcResult result, std::string function); +#else bool CheckNVRTCResult(nvrtcResult result, std::string function); +#endif static bool available_; @@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode { int num_threads_{1024}; int workload_per_thread_{1}; std::vector ptx_; +#ifdef PADDLE_WITH_HIP + hipModule_t module_; + hipFunction_t function_; +#else CUmodule module_; CUfunction function_; +#endif }; #endif diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc index bb4fceb85de0aec8bc2a4c6231183b747f8306b5..aadfffb59133bfb3c0c77599dcec891acbeab689 100644 --- a/paddle/fluid/platform/device_code_test.cc +++ b/paddle/fluid/platform/device_code_test.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/device_code.h" +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/init.h" +#ifdef PADDLE_WITH_CUDA constexpr auto saxpy_code = R"( extern "C" __global__ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { @@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { } } )"; +#endif -#ifdef PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_HIP +constexpr auto saxpy_code = R"( +#include +extern "C" __global__ +void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; + tid += blockDim.x * gridDim.x) { + z[tid] = a * x[tid] + y[tid]; + } +} +)"; +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(DeviceCode, cuda) { if (!paddle::platform::dynload::HasNVRTC() || !paddle::platform::dynload::HasCUDADriver()) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a04214c7014650f4c97428d5ba18dd874025cc79..c5fb46833f76027fa82895abf5059e4d3931afab 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { } if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* default_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); auto& desired_dev_ctx = @@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } @@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool( EmplaceDeviceContext(&device_contexts_, p); #endif } else if (platform::is_gpu_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext(&device_contexts_, p); #else PADDLE_THROW( @@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool( "re-compile with WITH_GPU option.")); #endif } else if (platform::is_cuda_pinned_place(p)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( &device_contexts_, p); #else @@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; } xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class EigenCudaStreamDevice : public Eigen::StreamInterface { public: @@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } ~EigenCudaStreamDevice() override {} - void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { + void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) { stream_ = cuda_stream; place_ = place; device_prop_ = &Eigen::m_deviceProperties[place.device]; } - const cudaStream_t& stream() const 
override { return *stream_; } + const gpuStream_t& stream() const override { return *stream_; } +#ifdef PADDLE_WITH_HIP + const hipDeviceProp_t& deviceProperties() const override { +#else const cudaDeviceProp& deviceProperties() const override { +#endif return *device_prop_; } @@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; #endif semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); +#else PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); +#endif } return semaphore_; } private: CUDAPlace place_; - const cudaStream_t* stream_; // not owned; + const gpuStream_t* stream_; // not owned; +#ifdef PADDLE_WITH_HIP + const hipDeviceProp_t* device_prop_; +#else const cudaDeviceProp* device_prop_; // not owned; +#endif mutable void* scratch_; mutable unsigned int* semaphore_; mutable std::mutex mtx_; // to protect allocations_ @@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place, InitEigenContext(); InitCuBlasContext(); InitCuDNNContext(); +#ifndef PADDLE_WITH_HIP InitCuSolverContext(); +#endif } CUDAContext::~CUDAContext() { CUDADeviceGuard guard(place_.device); DestoryCuDNNContext(); DestoryCuBlasContext(); +#ifndef PADDLE_WITH_HIP DestoryCuSolverContext(); +#endif } CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { @@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { << ", Runtime API Version: " << runtime_version_ / 1000 << "." << (runtime_version_ % 100) / 10; +#ifdef PADDLE_WITH_HIP + size_t version_major, version_minor, version_patch; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + &version_major, &version_minor, &version_patch)); + LOG_FIRST_N(WARNING, 1) << "device: " << place_.device + << ", MIOpen Version: " << version_major << "." + << version_minor << "." << version_patch; +#else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." 
<< (cudnn_dso_ver % 1000) / 100 << "."; - +#endif { // Check CUDA/CUDNN version compatiblity auto local_cuda_version = (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; +#ifdef PADDLE_WITH_HIP + auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10); +#else auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; +#endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device @@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_)); } @@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const { return max_grid_dim_size_; } +#ifdef PADDLE_WITH_HIP +miopenHandle_t CUDADeviceContext::cudnn_handle() const { +#else cudnnHandle_t CUDADeviceContext::cudnn_handle() const { +#endif return context()->CudnnHandle(); } @@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } +#ifndef PADDLE_WITH_HIP cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const { return context()->CusolverDnHandle(); } +#endif -cudaStream_t CUDADeviceContext::stream() const { - return context()->RawStream(); -} +gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); } CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e37a5e18e013605e66ab7d287e209a90c190eb06..72138b7909117736f06fc4827b63b1895a43d72e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -30,6 +30,16 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/cuda_helper.h" // NOLINT +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/dynload/rocblas.h" +#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/dynload/rccl.h" +#endif +#include "paddle/fluid/platform/gpu_info.h" // NOLINT +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -44,7 +54,7 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/stream/cuda_stream.h" #endif #include "unsupported/Eigen/CXX11/Tensor" @@ -62,7 +72,7 @@ struct GpuDevice; namespace paddle { namespace platform { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /*Set the value of the global variable allow_tf32_cublas*/ void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ @@ -153,7 +163,7 @@ struct DefaultDeviceContextType { }; #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CudnnWorkspaceHandle; class EigenCudaStreamDevice; @@ -179,13 +189,19 @@ class CUDAContext { const std::unique_ptr& Stream() const { return stream_; } - const cudaStream_t& RawStream() { return stream_->raw_stream(); } + const gpuStream_t& RawStream() { return stream_->raw_stream(); } +#ifdef PADDLE_WITH_HIP + const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; } +#else const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; } +#endif +#ifndef PADDLE_WITH_HIP const cusolverDnHandle_t& CusolverDnHandle() const { return cusolver_dn_handle_; } +#endif const std::unique_ptr& CublasHandle() const { return cublas_handle_; @@ -222,6 +238,11 @@ class CUDAContext { private: void InitEigenContext(); +#ifdef PADDLE_WITH_HIP + void InitCuBlasContext() { + cublas_handle_.reset(new CublasHandleHolder(RawStream())); + } +#else void InitCuBlasContext() { cublas_handle_.reset( new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH)); @@ -236,9 +257,32 @@ class CUDAContext { #endif // CUDA_VERSION >= 9000 } } +#endif void InitCuDNNContext() { if (dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( + &miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100; + auto compile_miopen_version = MIOPEN_VERSION / 100; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 10 << "." << compile_miopen_version % 10 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 10 << "." << local_miopen_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenSetStream(cudnn_handle_, RawStream())); +#else auto local_cudnn_version = dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; if (local_cudnn_version < static_cast(compile_cudnn_version)) { @@ -255,20 +299,27 @@ class CUDAContext { PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_RETRY_CUDA_SUCCESS( dynload::cudnnSetStream(cudnn_handle_, RawStream())); +#endif } else { cudnn_handle_ = nullptr; } } +#ifndef PADDLE_WITH_HIP void InitCuSolverContext() { PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_)); PADDLE_RETRY_CUDA_SUCCESS( dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream())); } +#endif void DestoryCuDNNContext() { if (cudnn_handle_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_)); +#endif } cudnn_handle_ = nullptr; } @@ -279,22 +330,30 @@ class CUDAContext { cublas_tf32_tensor_core_handle_.reset(); } +#ifndef PADDLE_WITH_HIP void DestoryCuSolverContext() { if (cusolver_dn_handle_) { PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cusolverDnDestroy(cusolver_dn_handle_)); } } +#endif CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; std::unique_ptr stream_; +#ifdef PADDLE_WITH_HIP + miopenHandle_t cudnn_handle_; +#else cudnnHandle_t cudnn_handle_; +#endif std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; std::unique_ptr cublas_tf32_tensor_core_handle_; +#ifndef PADDLE_WITH_HIP cusolverDnHandle_t cusolver_dn_handle_; +#endif DISABLE_COPY_AND_ASSIGN(CUDAContext); }; @@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext { return context()->TensorCoreCublasCallIfAvailable(callback); } - /*! \brief Return cudnn handle in the device context. */ +/*! \brief Return cudnn handle in the device context. */ +#ifdef PADDLE_WITH_HIP + miopenHandle_t cudnn_handle() const; +#else cudnnHandle_t cudnn_handle() const; +#endif /*! \brief Return a cudnn workspace handle to call multiple cudnn * functions without interrupting by other threads. @@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext { * sequential cudnn function calls. */ CudnnWorkspaceHandle cudnn_workspace_handle() const; +#ifndef PADDLE_WITH_HIP cusolverDnHandle_t cusolver_dn_handle() const; +#endif /*! \brief Return cuda stream in the device context. */ - cudaStream_t stream() const; + gpuStream_t stream() const; -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) /*! \brief Return nccl communicators. */ ncclComm_t nccl_comm() const { return nccl_comm_; } @@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext { #endif template - void RecordEvent(cudaEvent_t ev, Callback callback) const { + void RecordEvent(gpuEvent_t ev, Callback callback) const { return context()->Stream()->RecordEvent(ev, callback); } @@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex cudnn_handle_mtx_; -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. 
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 5b3aa98efb46b51d6c3edb6d2cbd4200bd0a35c6..857d5d276516059acace1727b86f8d837e279cd3 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) { CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); +#ifdef PADDLE_WITH_HIP + miopenHandle_t cudnn_handle = device_context->cudnn_handle(); +#else cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); +#endif ASSERT_NE(nullptr, cudnn_handle); delete device_context; } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d873ac619f347b92deaf956e1b0765591b2a406d..47ade89ff2df3f53e3d99fbe1b203314b9edabd2 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -42,8 +42,7 @@ limitations under the License. */ #include #include #include -#include // NOLINT -#include "paddle/fluid/platform/cuda_error.pb.h" // NOLINT +#include // NOLINT #endif #include @@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) { inline bool is_error(hipError_t e) { return e != hipSuccess; } inline std::string build_rocm_error_msg(hipError_t e) { -#if defined(PADDLE_WITH_HIP) - int32_t cuda_version = 100; -#else - int32_t cuda_version = -1; -#endif std::ostringstream sout; sout << " Hip error(" << e << "), " << hipGetErrorString(e) << "."; return sout.str(); diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/miopen_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..68db32bac103bda29a89a7b74765f549b34b5519 --- /dev/null +++ b/paddle/fluid/platform/miopen_desc.h @@ -0,0 +1,221 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/miopen_helper.h" + +namespace paddle { +namespace framework { +class Tensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace platform { +using framework::Tensor; + +template +inline miopenDataType_t ToMIOpenDataType(const T& t) { + auto type = framework::ToDataType(t); + return ToMIOpenDataType(type); +} + +inline std::vector TransformDimOrder(const std::vector& dims) { + std::vector transformed_dims(dims.begin(), dims.end()); + int H, W, D, C; + if (dims.size() == 4) { + H = dims[1]; + W = dims[2]; + C = dims[3]; + transformed_dims[1] = C; + transformed_dims[2] = H; + transformed_dims[3] = W; + } else { + D = dims[1]; + H = dims[2]; + W = dims[3]; + C = dims[4]; + transformed_dims[1] = C; + transformed_dims[2] = D; + transformed_dims[3] = H; + transformed_dims[4] = W; + } + return transformed_dims; +} + +template <> +inline miopenDataType_t ToMIOpenDataType( + const framework::proto::VarType::Type& t) { + miopenDataType_t type = miopenFloat; + switch (t) { + case framework::proto::VarType::FP16: + type = miopenHalf; + break; + case framework::proto::VarType::FP32: + type = miopenFloat; + break; + default: + break; + } + return type; +} + +class ActivationDescriptor { + public: + ActivationDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenCreateActivationDescriptor(&desc_)); + } + ~ActivationDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenDestroyActivationDescriptor(desc_)); + } + template + void set(miopenActivationMode_t mode, const T& coef) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + desc_, mode, static_cast(coef), 0.0, 0.0)); + } + + miopenActivationDescriptor_t desc() { return desc_; } + miopenActivationDescriptor_t desc() const { return desc_; } + + private: + miopenActivationDescriptor_t desc_; +}; + +class TensorDescriptor { + public: + TensorDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~TensorDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + } + miopenTensorDescriptor_t desc() { return desc_; } + miopenTensorDescriptor_t desc() const { return desc_; } + + void set(const Tensor& tensor, const int groups = 1) { + auto dims = framework::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, ToMIOpenDataType(tensor.type()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + void set(const Tensor& tensor, const miopenTensorFormat_t format) { + const int groups = 1; + auto dims = framework::vectorize(tensor.dims()); + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, ToMIOpenDataType(tensor.type()), + 
static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + + private: + miopenTensorDescriptor_t desc_; +}; + +class FilterDescriptor { + public: + FilterDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~FilterDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + } + miopenTensorDescriptor_t desc() { return desc_; } + miopenTensorDescriptor_t desc() const { return desc_; } + + void set(const Tensor& tensor, const miopenTensorFormat_t format, + const int groups = 1) { + auto dims = framework::vectorize(tensor.dims()); + std::vector transformed_dims; + PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW, + platform::errors::InvalidArgument( + "format should ONLY be NCHW in MIOPEN.")); + transformed_dims = dims; + if (groups > 1) { + transformed_dims[1] = transformed_dims[1] / groups; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, ToMIOpenDataType(tensor.type()), + static_cast(transformed_dims.size()), + const_cast(transformed_dims.data()), nullptr)); + } + + private: + miopenTensorDescriptor_t desc_; +}; + +class ConvolutionDescriptor { + public: + ConvolutionDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenCreateConvolutionDescriptor(&desc_)); + } + ~ConvolutionDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenDestroyConvolutionDescriptor(desc_)); + } + miopenConvolutionDescriptor_t desc() { return desc_; } + miopenConvolutionDescriptor_t desc() const { return desc_; } + + void set(miopenDataType_t dtype, const std::vector& pads, + const std::vector& strides, const std::vector& dilations, + bool allow_tf32, const int groups = 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + desc_, static_cast(pads.size()), const_cast(pads.data()), + const_cast(strides.data()), const_cast(dilations.data()), + miopenConvolution)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenSetConvolutionGroupCount(desc_, groups)); + } + + private: + miopenConvolutionDescriptor_t desc_; +}; + +} // namespace platform +} // namespace paddle diff --git a/tools/dockerfile/Dockerfile.rocm b/tools/dockerfile/Dockerfile.rocm index 2f624b2d9784b60c8aa544b17da884deca9d9549..6ae6b8963b7f5344856a505beed5c879b36925f8 100644 --- a/tools/dockerfile/Dockerfile.rocm +++ b/tools/dockerfile/Dockerfile.rocm @@ -1,29 +1,18 @@ # A image for building paddle binaries # Use rocm-terminal base image for both rocm environment # When you modify it, please be aware of rocm version -# -# Build: ROCM 3.5.1 -# cd Paddle/tools/dockerfile -# docker build -f Dockerfile.rocm \ -# --build-arg ROCM_VERSION=3.5.1 \ -# --build-arg CENTOS_VERSION=7.7.1908 \ -# -t paddlepaddle/paddle-centos-rocm35-dev:latest . # -# Build: ROCM 3.9.1 +# Build: ROCM 3.9 # cd Paddle/tools/dockerfile # docker build -f Dockerfile.rocm \ -# --build-arg ROCM_VERSION=3.9.1 \ -# --build-arg CENTOS_VERSION=7.8.2003 \ +# --build-arg ROCM_VERSION=3.9 \ # -t paddlepaddle/paddle-centos-rocm39-dev:latest . 
 #
-# Run: ROCM 3.5.1
 # docker run -it --device=/dev/kfd --device=/dev/dri \
 #            --security-opt seccomp=unconfined --group-add video \
-#            paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
+#            paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
 
-ARG CENTOS_VERSION
-FROM centos:${CENTOS_VERSION}
-ARG CENTOS_VERSION
+FROM centos:7.8.2003
 MAINTAINER PaddlePaddle Authors
 
 ENV LC_ALL en_US.UTF-8
@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
     zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
     make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
 
-# Install devtoolset-7 for ROCM 3.5/3.9
+# Install devtoolset-7
 RUN yum install -y yum-utils centos-release-scl && \
     yum-config-manager --enable rhel-server-rhscl-7-rpms && \
     yum-config-manager --enable rhel-7-server-rpms && \
@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
 ENV HIP_PATH=/opt/rocm/hip
 ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
 ENV PATH=/opt/rocm/bin:$PATH
-ENV PATH=/opt/rocm/hcc/bin:$PATH
-ENV PATH=/opt/rocm/hip/bin:$PATH
 ENV PATH=/opt/rocm/opencl/bin:$PATH
-ENV PATH=/opt/rocm/llvm/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
 
 # git 2.17.1
 RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
     cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
 
+# configure ssh
+RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
+    sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
+    sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
+
+RUN ssh-keygen -A
+
 EXPOSE 22
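Note: once the image is built and started with the `docker run` line in the header, the quickest check that the container actually sees the ROCm toolchain and a GPU is a tiny HIP program; a sketch, with the file name and `hipcc` compile line chosen for illustration:

```cpp
// hip_smoke_test.cc — build inside the container with:
//   hipcc hip_smoke_test.cc -o hip_smoke_test && ./hip_smoke_test
#include <hip/hip_runtime.h>
#include <cstdio>

int main() {
  int count = 0;
  // Fails cleanly if /dev/kfd or /dev/dri were not passed to the container.
  hipError_t err = hipGetDeviceCount(&count);
  if (err != hipSuccess) {
    std::printf("hipGetDeviceCount failed: %s\n", hipGetErrorString(err));
    return 1;
  }
  std::printf("visible HIP devices: %d\n", count);
  return 0;
}
```

A zero exit status with a non-zero device count confirms the `--device=/dev/kfd --device=/dev/dri` flags and the `video` group membership from the header comment are working before attempting a full `WITH_ROCM=ON` Paddle build.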