Unverified commit 93c1d9e7, authored by Qi Li, committed by GitHub

[ROCM] update fluid platform for rocm39 (part3), test=develop (#30913)

Parent 15297a06
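
The recurring pattern in this change set is to route a single symbol (for example gpuStream_t or gpuEvent_t) to either the CUDA or the HIP runtime type depending on the build macro, and to guard backend-specific calls with PADDLE_WITH_CUDA / PADDLE_WITH_HIP. A minimal sketch of that alias pattern follows; the alias names come from the diff, but the exact header in which Paddle declares them is not part of this commit and is assumed here for illustration.

    // Hedged sketch of the CUDA/HIP alias pattern this diff relies on.
    // gpuStream_t / gpuEvent_t appear in the changed files; where Paddle
    // actually defines them is an assumption, not shown in this commit.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    #else  // PADDLE_WITH_CUDA
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    #endif

With such aliases in place, code that previously hard-coded cudaStream_t (for example NCCLComm::stream() or CudaStreamResourcePool below) can compile unchanged against either backend.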
......@@ -52,7 +52,12 @@ ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
IF(WITH_GPU)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
IF(WITH_ROCM)
hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda)
ENDIF()
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
......@@ -72,7 +77,7 @@ IF(WITH_DGC)
set(dgc_deps dgc)
ENDIF()
IF(WITH_GPU)
IF(WITH_GPU OR WITH_ROCM)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()
......@@ -81,9 +86,14 @@ IF(WITH_MKLDNN)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
IF(WITH_GPU)
nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_ROCM)
hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
ENDIF()
IF(WITH_GPU OR WITH_ROCM)
set(STREAM_CALLBACK_DEPS stream_callback_manager)
ELSE()
set(STREAM_CALLBACK_DEPS)
......@@ -103,18 +113,26 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool
cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info)
target_link_libraries(device_context cuda_resource_pool)
endif()
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if(WITH_GPU)
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
if(WITH_ROCM)
hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
hip_test(miopen_helper_test SRCS miopen_helper_test.cc DEPS dynload_cuda)
hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda tensor)
hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
endif()
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
......@@ -127,25 +145,34 @@ if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_test(cuda_helper_test SRCS cuda_helper_test.cu)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
IF(WITH_GPU)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
IF(WITH_ROCM)
hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags)
hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
ENDIF()
if(NOT APPLE AND NOT WIN32)
cc_library(device_code SRCS device_code.cc DEPS device_context)
if(WITH_GPU)
if(WITH_GPU OR WITH_ROCM)
cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor)
endif()
endif()
......@@ -13,10 +13,11 @@
// limitations under the License.
#include "paddle/fluid/platform/collective_helper.h"
#include <utility>
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLCommImpl : public NCCLComm {
public:
void set_ring_id(int ring_id) { ring_id_ = ring_id; }
......@@ -35,7 +36,7 @@ class NCCLCommImpl : public NCCLComm {
void set_comm(ncclComm_t comm) { comm_ = comm; }
ncclComm_t comm() const override { return comm_; }
cudaStream_t stream() const override { return dev_ctx_->stream(); }
gpuStream_t stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<CUDADeviceContext>&& dev_ctx) {
dev_ctx_ = std::move(dev_ctx);
......
......@@ -27,7 +27,7 @@
namespace paddle {
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// In order to apply hierarchical communication with NCCL, we need
// a communication ring contains NCCL communicators associated to a global
// ncclUniqueId. E.g. for a hierarchical case,
......@@ -56,7 +56,7 @@ class NCCLComm {
virtual int rank() const = 0;
virtual int device_id() const = 0;
virtual ncclComm_t comm() const = 0;
virtual cudaStream_t stream() const = 0;
virtual gpuStream_t stream() const = 0;
virtual CUDADeviceContext* dev_context() const = 0;
virtual ~NCCLComm() = default;
};
......
......@@ -14,10 +14,8 @@ limitations under the License. */
#pragma once
#include <cuda.h>
// NOTE(): support float16 to half in header file.
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
......@@ -25,6 +23,9 @@ limitations under the License. */
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
#if CUDA_VERSION < 9000
#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
#else
......@@ -32,6 +33,7 @@ namespace platform {
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
#endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
......@@ -67,7 +69,7 @@ template <typename T>
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
......@@ -77,7 +79,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
template <typename T>
__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
......@@ -85,18 +87,27 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
}
// CUDA 9.0 have native compatible float16 shfl_down
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_down(static_cast<float>(val),
static_cast<unsigned>(delta), width));
#else
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
#endif
}
template <>
__forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
float16 val, int width) {
#ifdef PADDLE_WITH_HIP
return float16(__shfl_xor(static_cast<float>(val), width));
#else
return float16(__shfl_xor(static_cast<half>(val), width));
#endif
}
#else
template <>
......@@ -159,7 +170,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template <typename T>
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
return __shfl(val, src_line, width);
#else
return __shfl_sync(mask, val, src_line, width);
......@@ -173,13 +184,17 @@ HOSTDEVICE T Infinity() {
template <typename T>
__device__ T reduceSum(T val, int tid, int len) {
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
// NOTE(zcd): The warp size should be taken from the
// parameters of the GPU but not specified as 32 simply.
// To make the reduceSum more efficiently,
// I use Warp-Level Parallelism and assume the Warp size
// is 32 which may be different for different GPU,
// but most card's warp size is 32.
#ifdef PADDLE_WITH_HIP
const int warpSize = 64;
#else
const int warpSize = 32;
#endif
__shared__ T shm[warpSize];
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, tid < len);
......
......@@ -16,11 +16,16 @@
#include <mutex> // NOLINT
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/rocblas.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
#if CUDA_VERSION < 9000
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
#endif
......@@ -77,6 +82,12 @@ namespace platform {
class CublasHandleHolder {
public:
#ifdef PADDLE_WITH_HIP
explicit CublasHandleHolder(hipStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_create_handle(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_set_stream(handle_, stream));
}
#else
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
......@@ -92,9 +103,14 @@ class CublasHandleHolder {
}
#endif // CUDA_VERSION >= 9000
}
#endif
~CublasHandleHolder() PADDLE_MAY_THROW {
#ifdef PADDLE_WITH_HIP
PADDLE_RETRY_CUDA_SUCCESS(dynload::rocblas_destroy_handle(handle_));
#else
PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
#endif
}
template <typename Callback>
......@@ -106,7 +122,11 @@ class CublasHandleHolder {
private:
DISABLE_COPY_AND_ASSIGN(CublasHandleHolder);
#ifdef PADDLE_WITH_HIP
rocblas_handle handle_;
#else
cublasHandle_t handle_;
#endif
mutable std::mutex mtx_;
};
......
......@@ -47,8 +47,13 @@ void TestCase(size_t num) {
T *in1, *in2, *out;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<T*>(malloc(size));
in2 = reinterpret_cast<T*>(malloc(size));
out = reinterpret_cast<T*>(malloc(size));
......@@ -58,12 +63,22 @@ void TestCase(size_t num) {
in1[i] = static_cast<T>(dist(engine));
in2[i] = static_cast<T>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
AddKernel<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
......@@ -73,8 +88,13 @@ void TestCase(size_t num) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
// cuda primitives
......@@ -103,8 +123,13 @@ void TestUnalign(size_t num, const int shift_bit) {
size_t size = sizeof(uint8_t) * (num + shift_bit);
size_t array_size = sizeof(float16) * (num / 2);
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), size);
#endif
in1 = reinterpret_cast<float16*>(malloc(size));
in2 = reinterpret_cast<float16*>(malloc(size));
out = reinterpret_cast<float16*>(malloc(size));
......@@ -121,12 +146,23 @@ void TestUnalign(size_t num, const int shift_bit) {
r_in1[i] = static_cast<float16>(dist(engine));
r_in2[i] = static_cast<float16>(dist(engine));
}
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, r_in1, array_size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, r_in2, array_size, hipMemcpyHostToDevice);
hipLaunchKernelGGL(HIP_KERNEL_NAME(AddKernel<float16>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2,
num / 2);
hipDeviceSynchronize();
hipMemcpy(out, d_in2, array_size, hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice);
AddKernel<float16><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2);
cudaDeviceSynchronize();
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small truncate error.
// so we use EXPECT_NEAR to check the result.
......@@ -137,8 +173,13 @@ void TestUnalign(size_t num, const int shift_bit) {
free(in1);
free(in2);
free(out);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaAtomic, float16Unalign) {
......@@ -203,8 +244,13 @@ void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void**>(&d_in1), size);
hipMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#else
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
#endif
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
......@@ -212,17 +258,31 @@ void TestReduce(size_t num, float atol = 0.01) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
#ifdef PADDLE_WITH_HIP
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipDeviceSynchronize();
hipLaunchKernelGGL(HIP_KERNEL_NAME(DeviceReduceSum<T>), dim3(1),
dim3(PADDLE_CUDA_NUM_THREADS), 0, 0, d_in1, d_in2, num);
hipMemcpy(in1, d_in2, sizeof(T), hipMemcpyDeviceToHost);
hipDeviceSynchronize();
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
#endif
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
#ifdef PADDLE_WITH_HIP
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
TEST(CudaShuffleSync, float16) {
......
......@@ -13,7 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
......@@ -50,7 +55,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
static_cast<unsigned long long int>(val)); // NOLINT
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600)
USE_CUDA_ATOMIC(Add, double);
#else
CUDA_ATOMIC_WRAPPER(Add, double) {
......@@ -149,12 +154,12 @@ USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT
if (*address >= val) {
return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
......@@ -181,7 +186,7 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) {
CUDA_ATOMIC_WRAPPER(Max, float) {
if (*address >= val) {
return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
......@@ -199,7 +204,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) {
CUDA_ATOMIC_WRAPPER(Max, double) {
if (*address >= val) {
return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
......@@ -221,12 +226,12 @@ USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
// CUDA API uses unsigned long long int, we cannot use uint64_t here.
// It because unsigned long long int is not necessarily uint64_t
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350)
USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT
#else
CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT
if (*address <= val) {
return;
return *address;
}
unsigned long long int old = *address, assumed; // NOLINT
......@@ -253,7 +258,7 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
CUDA_ATOMIC_WRAPPER(Min, float) {
if (*address <= val) {
return;
return *address;
}
int *const address_as_i = reinterpret_cast<int *>(address);
......@@ -271,7 +276,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) {
CUDA_ATOMIC_WRAPPER(Min, double) {
if (*address <= val) {
return;
return *address;
}
unsigned long long int *const address_as_ull = // NOLINT
......
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_resource_pool.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -25,15 +25,24 @@ CudaStreamResourcePool::CudaStreamResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
cudaStream_t stream;
gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
#endif
return stream;
};
auto deleter = [dev_idx](cudaStream_t stream) {
auto deleter = [dev_idx](gpuStream_t stream) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream));
#endif
};
pool_.emplace_back(
......@@ -65,15 +74,24 @@ CudaEventResourcePool::CudaEventResourcePool() {
for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) {
auto creator = [dev_idx] {
platform::SetDeviceId(dev_idx);
cudaEvent_t event;
gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
#endif
return event;
};
auto deleter = [dev_idx](cudaEvent_t event) {
auto deleter = [dev_idx](gpuEvent_t event) {
platform::SetDeviceId(dev_idx);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event));
#endif
};
pool_.emplace_back(ResourcePool<CudaEventObject>::Create(creator, deleter));
......
......@@ -14,9 +14,17 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <memory>
#include <type_traits>
#include <vector>
......@@ -26,8 +34,8 @@
namespace paddle {
namespace platform {
using CudaStreamObject = std::remove_pointer<cudaStream_t>::type;
using CudaEventObject = std::remove_pointer<cudaEvent_t>::type;
using CudaStreamObject = std::remove_pointer<gpuStream_t>::type;
using CudaEventObject = std::remove_pointer<gpuEvent_t>::type;
class CudaStreamResourcePool {
public:
......
......@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_desc.h"
#else
#include "paddle/fluid/platform/cudnn_desc.h"
#endif
#include <gtest/gtest.h>
......
......@@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm>
#include <set>
#include <utility>
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_string(cuda_dir);
......@@ -71,26 +72,35 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
}
for (auto& p : set) {
if (is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
device_codes_.emplace(p, DeviceCodeMap());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"CUDAPlace is not supported, please re-compile with WITH_GPU=ON."));
"CUDAPlace or HIPPlace is not supported, please re-compile with "
"WITH_GPU=ON or WITH_ROCM=ON."));
#endif
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
CUDADeviceCode::CheckAvailableStatus();
#endif
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_HIP
static bool CheckCUDADriverResult(hipError_t result, std::string caller,
std::string kernel_name = "") {
if (result != hipSuccess) {
const char* error = nullptr;
error = dynload::hipGetErrorString(result);
#else
static bool CheckCUDADriverResult(CUresult result, std::string caller,
std::string kernel_name = "") {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
dynload::cuGetErrorString(result, &error);
#endif
LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
<< " > failed: " << error << " (" << result << ")";
return false;
......@@ -109,13 +119,23 @@ void CUDADeviceCode::CheckAvailableStatus() {
int nvrtc_major = 0;
int nvrtc_minor = 0;
#ifdef PADDLE_WITH_HIP
hiprtcResult nvrtc_result =
dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor);
#else
nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
#endif
int driver_version = 0;
int dirver_major = 0;
int driver_minor = 0;
#ifdef PADDLE_WITH_HIP
hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version);
if (driver_result == hipSuccess) {
#else
CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
if (driver_result == CUDA_SUCCESS) {
#endif
dirver_major = driver_version / 1000;
driver_minor = (driver_version % 1000) / 10;
}
......@@ -123,13 +143,22 @@ void CUDADeviceCode::CheckAvailableStatus() {
LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
<< driver_minor << "; NVRTC Version: " << nvrtc_major
<< "." << nvrtc_minor;
#ifdef PADDLE_WITH_HIP
if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) {
#else
if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
#endif
return;
}
int count = 0;
#ifdef PADDLE_WITH_HIP
if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count),
"hipGetDeviceCount")) {
#else
if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
"cuDeviceGetCount")) {
#endif
available_ = true;
}
}
......@@ -163,12 +192,18 @@ static std::string FindCUDAIncludePath() {
}
}
#ifdef PADDLE_WITH_HIP
cuda_include_path = "/opt/rocm/include";
#else
cuda_include_path = "/usr/local/cuda/include";
#endif
if (stat(cuda_include_path.c_str(), &st) == 0) {
return cuda_include_path;
}
LOG(WARNING) << "Cannot find CUDA include path."
<< "Please check whether CUDA is installed in the default "
LOG(WARNING)
<< "Cannot find CUDA or ROCM include path."
<< "Please check whether CUDA or ROCM is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx.";
return "";
......@@ -183,7 +218,11 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
place_ = place;
name_ = name;
#ifdef PADDLE_WITH_HIP
kernel_ = "#include <hip/hip_runtime.h>\n" + kernel;
#else
kernel_ = kernel;
#endif
}
bool CUDADeviceCode::Compile(bool include_path) {
......@@ -193,7 +232,84 @@ bool CUDADeviceCode::Compile(bool include_path) {
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return false;
}
#ifdef PADDLE_WITH_HIP
hiprtcProgram program;
if (!CheckNVRTCResult(dynload::hiprtcCreateProgram(&program,
kernel_.c_str(), // buffer
name_.c_str(), // name
0, // numHeaders
nullptr, // headers
nullptr), // includeNames
"hiprtcCreateProgram")) {
return false;
}
// Compile the program for specified compute_capability
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
int compute_capability = dev_ctx->GetComputeCapability();
std::vector<const char*> options = {"-std=c++11", "--amdgpu-target=gfx906"};
std::string include_option;
if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
hiprtcResult compile_result =
dynload::hiprtcCompileProgram(program, // program
options.size(), // numOptions
options.data()); // options
if (compile_result == HIPRTC_ERROR_COMPILATION) {
// Obtain compilation log from the program
size_t log_size;
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLogSize(program, &log_size),
"hiprtcGetProgramLogSize")) {
return false;
}
std::vector<char> log;
log.resize(log_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetProgramLog(program, log.data()),
"hiprtcGetProgramLog")) {
return false;
}
LOG(WARNING) << "JIT compiling of ROCM GPU code failed:"
<< "\n Kernel name: " << name_ << "\n Kernel body:\n"
<< kernel_ << "\n Compiling log: " << log.data();
return false;
}
// Obtain PTX from the program for cuda
// Obtain Code from the program for hip
size_t ptx_size;
if (!CheckNVRTCResult(dynload::hiprtcGetCodeSize(program, &ptx_size),
"hiprtcGetCodeSize")) {
return false;
}
ptx_.resize(ptx_size + 1);
if (!CheckNVRTCResult(dynload::hiprtcGetCode(program, ptx_.data()),
"hiprtcGetCode")) {
return false;
}
if (!CheckNVRTCResult(dynload::hiprtcDestroyProgram(&program),
"hiprtcDestroyProgram")) {
return false;
}
if (!CheckCUDADriverResult(dynload::hipModuleLoadData(&module_, ptx_.data()),
"hipModuleLoadData")) {
return false;
}
if (!CheckCUDADriverResult(
dynload::hipModuleGetFunction(&function_, module_, name_.c_str()),
"hipModuleGetFunction")) {
return false;
}
#else
nvrtcProgram program;
if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program,
kernel_.c_str(), // buffer
......@@ -271,6 +387,7 @@ bool CUDADeviceCode::Compile(bool include_path) {
"cuModuleGetFunction", name_)) {
return false;
}
#endif
max_threads_ = dev_ctx->GetMaxPhysicalThreadCount();
is_compiled_ = true;
......@@ -291,6 +408,18 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
auto* dev_ctx = reinterpret_cast<CUDADeviceContext*>(
DeviceContextPool::Instance().Get(place_));
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_EQ(
dynload::hipModuleLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
0, // shared memory
dev_ctx->stream(), // stream
args->data(), // arguments
nullptr),
hipSuccess,
errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)",
name_.c_str()));
#else
PADDLE_ENFORCE_EQ(
dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim
num_threads_, 1, 1, // block dim
......@@ -301,8 +430,19 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
CUDA_SUCCESS,
errors::External("Fail to launch kernel %s (in cuLaunchKernel.)",
name_.c_str()));
#endif
}
#ifdef PADDLE_WITH_HIP
bool CUDADeviceCode::CheckNVRTCResult(hiprtcResult result,
std::string function) {
if (result != HIPRTC_SUCCESS) {
LOG_FIRST_N(WARNING, 1)
<< "Call " << function << " for < " << name_
<< " > failed: " << dynload::hiprtcGetErrorString(result);
return false;
}
#else
bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
std::string function) {
if (result != NVRTC_SUCCESS) {
......@@ -311,6 +451,7 @@ bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
<< " > failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
#endif
return true;
}
#endif
......
......@@ -25,6 +25,10 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cuda_driver.h"
#include "paddle/fluid/platform/dynload/nvrtc.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hiprtc.h"
#include "paddle/fluid/platform/dynload/rocm_driver.h"
#endif
namespace paddle {
namespace platform {
......@@ -44,7 +48,7 @@ class DeviceCode {
std::string kernel_;
};
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CUDADeviceCode : public DeviceCode {
public:
explicit CUDADeviceCode(const Place& place, const std::string& name,
......@@ -61,7 +65,11 @@ class CUDADeviceCode : public DeviceCode {
static bool IsAvailable() { return available_; }
private:
#ifdef PADDLE_WITH_HIP
bool CheckNVRTCResult(hiprtcResult result, std::string function);
#else
bool CheckNVRTCResult(nvrtcResult result, std::string function);
#endif
static bool available_;
......@@ -70,8 +78,13 @@ class CUDADeviceCode : public DeviceCode {
int num_threads_{1024};
int workload_per_thread_{1};
std::vector<char> ptx_;
#ifdef PADDLE_WITH_HIP
hipModule_t module_;
hipFunction_t function_;
#else
CUmodule module_;
CUfunction function_;
#endif
};
#endif
......
......@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <utility>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
constexpr auto saxpy_code = R"(
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
......@@ -26,8 +28,22 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
}
}
)";
#endif
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
constexpr auto saxpy_code = R"(
#include <hip/hip_runtime.h>
extern "C" __global__
void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {
for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n;
tid += blockDim.x * gridDim.x) {
z[tid] = a * x[tid] + y[tid];
}
}
)";
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(DeviceCode, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
......
......@@ -12,7 +12,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include <set>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......@@ -29,7 +29,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
}
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto& desired_dev_ctx =
......@@ -65,7 +65,7 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
bool allow_tf32_cublas = true;
void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; }
bool AllowTF32Cublas() { return allow_tf32_cublas; }
......@@ -122,7 +122,7 @@ DeviceContextPool::DeviceContextPool(
EmplaceDeviceContext<CPUDeviceContext, CPUPlace>(&device_contexts_, p);
#endif
} else if (platform::is_gpu_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
......@@ -130,7 +130,7 @@ DeviceContextPool::DeviceContextPool(
"re-compile with WITH_GPU option."));
#endif
} else if (platform::is_cuda_pinned_place(p)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
&device_contexts_, p);
#else
......@@ -229,7 +229,7 @@ Place XPUDeviceContext::GetPlace() const { return place_; }
xpu::Context* XPUDeviceContext::x_context() const { return context_; }
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class EigenCudaStreamDevice : public Eigen::StreamInterface {
public:
......@@ -238,15 +238,19 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
}
~EigenCudaStreamDevice() override {}
void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
void Reinitialize(const gpuStream_t* cuda_stream, CUDAPlace place) {
stream_ = cuda_stream;
place_ = place;
device_prop_ = &Eigen::m_deviceProperties[place.device];
}
const cudaStream_t& stream() const override { return *stream_; }
const gpuStream_t& stream() const override { return *stream_; }
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t& deviceProperties() const override {
#else
const cudaDeviceProp& deviceProperties() const override {
#endif
return *device_prop_;
}
......@@ -295,16 +299,25 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
char* scratch = static_cast<char*>(scratchpad()) + Eigen::kGpuScratchSize;
#endif
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
#endif
}
return semaphore_;
}
private:
CUDAPlace place_;
const cudaStream_t* stream_; // not owned;
const gpuStream_t* stream_; // not owned;
#ifdef PADDLE_WITH_HIP
const hipDeviceProp_t* device_prop_;
#else
const cudaDeviceProp* device_prop_; // not owned;
#endif
mutable void* scratch_;
mutable unsigned int* semaphore_;
mutable std::mutex mtx_; // to protect allocations_
......@@ -339,14 +352,18 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
InitEigenContext();
InitCuBlasContext();
InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
InitCuSolverContext();
#endif
}
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
DestoryCuDNNContext();
DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
DestoryCuSolverContext();
#endif
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
......@@ -369,17 +386,29 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
size_t version_major, version_minor, version_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&version_major, &version_minor, &version_patch));
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", MIOpen Version: " << version_major << "."
<< version_minor << "." << version_patch;
#else
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
<< (cudnn_dso_ver % 1000) / 100 << ".";
#endif
{
// Check CUDA/CUDNN version compatiblity
auto local_cuda_version =
(driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
#ifdef PADDLE_WITH_HIP
auto compile_cuda_version = (HIP_VERSION / 100) * 10 + (HIP_VERSION % 10);
#else
auto compile_cuda_version =
(CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
#endif
if (local_cuda_version < compile_cuda_version) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
......@@ -397,7 +426,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device);
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_comm_) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
}
......@@ -434,7 +463,11 @@ dim3 CUDADeviceContext::GetCUDAMaxGridDimSize() const {
return max_grid_dim_size_;
}
#ifdef PADDLE_WITH_HIP
miopenHandle_t CUDADeviceContext::cudnn_handle() const {
#else
cudnnHandle_t CUDADeviceContext::cudnn_handle() const {
#endif
return context()->CudnnHandle();
}
......@@ -442,13 +475,13 @@ CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
}
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t CUDADeviceContext::cusolver_dn_handle() const {
return context()->CusolverDnHandle();
}
#endif
cudaStream_t CUDADeviceContext::stream() const {
return context()->RawStream();
}
gpuStream_t CUDADeviceContext::stream() const { return context()->RawStream(); }
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
......
......@@ -30,6 +30,16 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_helper.h" // NOLINT
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/platform/gpu_info.h" // NOLINT
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
#endif
......@@ -44,7 +54,7 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/stream/cuda_stream.h"
#endif
#include "unsupported/Eigen/CXX11/Tensor"
......@@ -62,7 +72,7 @@ struct GpuDevice;
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/*Set the value of the global variable allow_tf32_cublas*/
void SetAllowTF32Cublas(bool active);
/*Get the global variable allow_tf32_cublas value*/
......@@ -153,7 +163,7 @@ struct DefaultDeviceContextType<platform::XPUPlace> {
};
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class CudnnWorkspaceHandle;
class EigenCudaStreamDevice;
......@@ -179,13 +189,19 @@ class CUDAContext {
const std::unique_ptr<stream::CUDAStream>& Stream() const { return stream_; }
const cudaStream_t& RawStream() { return stream_->raw_stream(); }
const gpuStream_t& RawStream() { return stream_->raw_stream(); }
#ifdef PADDLE_WITH_HIP
const miopenHandle_t& CudnnHandle() const { return cudnn_handle_; }
#else
const cudnnHandle_t& CudnnHandle() const { return cudnn_handle_; }
#endif
#ifndef PADDLE_WITH_HIP
const cusolverDnHandle_t& CusolverDnHandle() const {
return cusolver_dn_handle_;
}
#endif
const std::unique_ptr<CublasHandleHolder>& CublasHandle() const {
return cublas_handle_;
......@@ -222,6 +238,11 @@ class CUDAContext {
private:
void InitEigenContext();
#ifdef PADDLE_WITH_HIP
void InitCuBlasContext() {
cublas_handle_.reset(new CublasHandleHolder(RawStream()));
}
#else
void InitCuBlasContext() {
cublas_handle_.reset(
new CublasHandleHolder(RawStream(), CUBLAS_DEFAULT_MATH));
......@@ -236,9 +257,32 @@ class CUDAContext {
#endif // CUDA_VERSION >= 9000
}
}
#endif
void InitCuDNNContext() {
if (dynload::HasCUDNN()) {
#ifdef PADDLE_WITH_HIP
size_t miopen_major, miopen_minor, miopen_patch;
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
&miopen_major, &miopen_minor, &miopen_patch));
auto local_miopen_version =
(miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
auto compile_miopen_version = MIOPEN_VERSION / 100;
if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with MIOPEN "
<< compile_miopen_version / 10 << "." << compile_miopen_version % 10
<< ", but MIOPEN version in your machine is "
<< local_miopen_version / 10 << "." << local_miopen_version % 10
<< ", which may cause serious incompatible bug. "
<< "Please recompile or reinstall Paddle with compatible MIOPEN "
"version.";
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreate(&cudnn_handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenSetStream(cudnn_handle_, RawStream()));
#else
auto local_cudnn_version = dynload::cudnnGetVersion() / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
......@@ -255,20 +299,27 @@ class CUDAContext {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cudnnSetStream(cudnn_handle_, RawStream()));
#endif
} else {
cudnn_handle_ = nullptr;
}
}
#ifndef PADDLE_WITH_HIP
void InitCuSolverContext() {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_));
PADDLE_RETRY_CUDA_SUCCESS(
dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream()));
}
#endif
void DestoryCuDNNContext() {
if (cudnn_handle_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroy(cudnn_handle_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_));
#endif
}
cudnn_handle_ = nullptr;
}
......@@ -279,22 +330,30 @@ class CUDAContext {
cublas_tf32_tensor_core_handle_.reset();
}
#ifndef PADDLE_WITH_HIP
void DestoryCuSolverContext() {
if (cusolver_dn_handle_) {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cusolverDnDestroy(cusolver_dn_handle_));
}
}
#endif
CUDAPlace place_;
std::unique_ptr<Eigen::GpuDevice> eigen_device_;
std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
std::unique_ptr<stream::CUDAStream> stream_;
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle_;
#else
cudnnHandle_t cudnn_handle_;
#endif
std::unique_ptr<CublasHandleHolder> cublas_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle_;
#endif
DISABLE_COPY_AND_ASSIGN(CUDAContext);
};
......@@ -343,8 +402,12 @@ class CUDADeviceContext : public DeviceContext {
return context()->TensorCoreCublasCallIfAvailable(callback);
}
/*! \brief Return cudnn handle in the device context. */
/*! \brief Return cudnn handle in the device context. */
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle() const;
#else
cudnnHandle_t cudnn_handle() const;
#endif
/*! \brief Return a cudnn workspace handle to call multiple cudnn
* functions without interrupting by other threads.
......@@ -355,12 +418,14 @@ class CUDADeviceContext : public DeviceContext {
* sequential cudnn function calls. */
CudnnWorkspaceHandle cudnn_workspace_handle() const;
#ifndef PADDLE_WITH_HIP
cusolverDnHandle_t cusolver_dn_handle() const;
#endif
/*! \brief Return cuda stream in the device context. */
cudaStream_t stream() const;
gpuStream_t stream() const;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
/*! \brief Return nccl communicators. */
ncclComm_t nccl_comm() const { return nccl_comm_; }
......@@ -369,7 +434,7 @@ class CUDADeviceContext : public DeviceContext {
#endif
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) const {
void RecordEvent(gpuEvent_t ev, Callback callback) const {
return context()->Stream()->RecordEvent(ev, callback);
}
......@@ -411,7 +476,7 @@ class CUDADeviceContext : public DeviceContext {
mutable std::mutex cudnn_handle_mtx_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NCCL communicator (single process version) for NCCL collective operations.
// NCCL collective operations provides fast collectives over multiple GPUs
// both within and across nodes.
......
......@@ -41,7 +41,11 @@ TEST(Device, CUDADeviceContext) {
CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
Eigen::GpuDevice* gpu_device = device_context->eigen_device();
ASSERT_NE(nullptr, gpu_device);
#ifdef PADDLE_WITH_HIP
miopenHandle_t cudnn_handle = device_context->cudnn_handle();
#else
cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
#endif
ASSERT_NE(nullptr, cudnn_handle);
delete device_context;
}
......
......@@ -43,7 +43,6 @@ limitations under the License. */
#include <rocblas.h>
#include <thrust/system/hip/error.h>
#include <thrust/system_error.h> // NOLINT
#include "paddle/fluid/platform/cuda_error.pb.h" // NOLINT
#endif
#include <fstream>
......@@ -1034,11 +1033,6 @@ inline void retry_sleep(unsigned milliseconds) {
inline bool is_error(hipError_t e) { return e != hipSuccess; }
inline std::string build_rocm_error_msg(hipError_t e) {
#if defined(PADDLE_WITH_HIP)
int32_t cuda_version = 100;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout;
sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
return sout.str();
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <functional>
#include <iostream>
#include <iterator>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/miopen_helper.h"
namespace paddle {
namespace framework {
class Tensor;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace platform {
using framework::Tensor;
template <typename T>
inline miopenDataType_t ToMIOpenDataType(const T& t) {
auto type = framework::ToDataType(t);
return ToMIOpenDataType(type);
}
inline std::vector<int> TransformDimOrder(const std::vector<int>& dims) {
std::vector<int> transformed_dims(dims.begin(), dims.end());
int H, W, D, C;
if (dims.size() == 4) {
H = dims[1];
W = dims[2];
C = dims[3];
transformed_dims[1] = C;
transformed_dims[2] = H;
transformed_dims[3] = W;
} else {
D = dims[1];
H = dims[2];
W = dims[3];
C = dims[4];
transformed_dims[1] = C;
transformed_dims[2] = D;
transformed_dims[3] = H;
transformed_dims[4] = W;
}
return transformed_dims;
}
template <>
inline miopenDataType_t ToMIOpenDataType(
const framework::proto::VarType::Type& t) {
miopenDataType_t type = miopenFloat;
switch (t) {
case framework::proto::VarType::FP16:
type = miopenHalf;
break;
case framework::proto::VarType::FP32:
type = miopenFloat;
break;
default:
break;
}
return type;
}
class ActivationDescriptor {
public:
ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateActivationDescriptor(&desc_));
}
~ActivationDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyActivationDescriptor(desc_));
}
template <typename T>
void set(miopenActivationMode_t mode, const T& coef) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor(
desc_, mode, static_cast<double>(coef), 0.0, 0.0));
}
miopenActivationDescriptor_t desc() { return desc_; }
miopenActivationDescriptor_t desc() const { return desc_; }
private:
miopenActivationDescriptor_t desc_;
};
class TensorDescriptor {
public:
TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~TensorDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
void set(const Tensor& tensor, const miopenTensorFormat_t format) {
const int groups = 1;
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> strides(dims.size());
strides[dims.size() - 1] = 1;
for (int i = dims.size() - 2; i >= 0; i--) {
strides[i] = dims[i + 1] * strides[i + 1];
}
std::vector<int> dims_with_group(dims.begin(), dims.end());
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(dims_with_group.size()),
const_cast<int*>(dims_with_group.data()),
const_cast<int*>(strides.data())));
}
private:
miopenTensorDescriptor_t desc_;
};
class FilterDescriptor {
public:
FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_));
}
~FilterDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_));
}
miopenTensorDescriptor_t desc() { return desc_; }
miopenTensorDescriptor_t desc() const { return desc_; }
void set(const Tensor& tensor, const miopenTensorFormat_t format,
const int groups = 1) {
auto dims = framework::vectorize<int>(tensor.dims());
std::vector<int> transformed_dims;
PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW,
platform::errors::InvalidArgument(
"format should ONLY be NCHW in MIOPEN."));
transformed_dims = dims;
if (groups > 1) {
transformed_dims[1] = transformed_dims[1] / groups;
}
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor(
desc_, ToMIOpenDataType(tensor.type()),
static_cast<int>(transformed_dims.size()),
const_cast<int*>(transformed_dims.data()), nullptr));
}
private:
miopenTensorDescriptor_t desc_;
};
class ConvolutionDescriptor {
public:
ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenCreateConvolutionDescriptor(&desc_));
}
~ConvolutionDescriptor() {
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::miopenDestroyConvolutionDescriptor(desc_));
}
miopenConvolutionDescriptor_t desc() { return desc_; }
miopenConvolutionDescriptor_t desc() const { return desc_; }
void set(miopenDataType_t dtype, const std::vector<int>& pads,
const std::vector<int>& strides, const std::vector<int>& dilations,
bool allow_tf32, const int groups = 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor(
desc_, static_cast<int>(pads.size()), const_cast<int*>(pads.data()),
const_cast<int*>(strides.data()), const_cast<int*>(dilations.data()),
miopenConvolution));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::miopenSetConvolutionGroupCount(desc_, groups));
}
private:
miopenConvolutionDescriptor_t desc_;
};
} // namespace platform
} // namespace paddle
......@@ -2,28 +2,17 @@
# Use rocm-terminal base image for both rocm environment
# When you modify it, please be aware of rocm version
#
# Build: ROCM 3.5.1
# Build: ROCM 3.9
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.5.1 \
# --build-arg CENTOS_VERSION=7.7.1908 \
# -t paddlepaddle/paddle-centos-rocm35-dev:latest .
#
# Build: ROCM 3.9.1
# cd Paddle/tools/dockerfile
# docker build -f Dockerfile.rocm \
# --build-arg ROCM_VERSION=3.9.1 \
# --build-arg CENTOS_VERSION=7.8.2003 \
# --build-arg ROCM_VERSION=3.9 \
# -t paddlepaddle/paddle-centos-rocm39-dev:latest .
#
# Run: ROCM 3.5.1
# docker run -it --device=/dev/kfd --device=/dev/dri \
# --security-opt seccomp=unconfined --group-add video \
# paddlepaddle/paddle-centos-rocm35-dev:latest /bin/bash
# paddlepaddle/paddle-centos-rocm39-dev:latest /bin/bash
ARG CENTOS_VERSION
FROM centos:${CENTOS_VERSION}
ARG CENTOS_VERSION
FROM centos:7.8.2003
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
ENV LC_ALL en_US.UTF-8
......@@ -34,7 +23,7 @@ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlit
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
# Install devtoolset-7 for ROCM 3.5/3.9
# Install devtoolset-7
RUN yum install -y yum-utils centos-release-scl && \
yum-config-manager --enable rhel-server-rhscl-7-rpms && \
yum-config-manager --enable rhel-7-server-rpms && \
......@@ -70,10 +59,8 @@ ENV ROCM_PATH=/opt/rocm
ENV HIP_PATH=/opt/rocm/hip
ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
ENV PATH=/opt/rocm/bin:$PATH
ENV PATH=/opt/rocm/hcc/bin:$PATH
ENV PATH=/opt/rocm/hip/bin:$PATH
ENV PATH=/opt/rocm/opencl/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:$PATH
ENV LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
# git 2.17.1
RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
......@@ -146,4 +133,12 @@ RUN cd /opt && wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache && \
cd .. && rm -rf ccache-3.7.9.tar.gz && rm -rf ccache-3.7.9
# configure ssh
RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/#UseDNS .*/UseDNS no/" /etc/ssh/sshd_config
RUN ssh-keygen -A
EXPOSE 22