From 334296306ca39eb452480a0a0e8d904de1b4a4b7 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 22 Feb 2021 20:03:48 +0800 Subject: [PATCH] [ROCM] update fluid platform for rocm39 (part4), test=develop (#30936) --- paddle/fluid/memory/allocation/CMakeLists.txt | 23 +- paddle/fluid/memory/malloc_test.cu | 42 +- paddle/fluid/memory/memcpy.cc | 56 +- paddle/fluid/memory/pinned_memory_test.cu | 60 +- .../fluid/platform/device_memory_aligment.cc | 2 +- .../fluid/platform/device_memory_aligment.h | 2 +- .../fluid/platform/dynload/dynamic_loader.cc | 5 +- paddle/fluid/platform/dynload/hiprtc.h | 1 + paddle/fluid/platform/dynload/miopen.h | 5 + paddle/fluid/platform/dynload/rocm_driver.h | 1 + paddle/fluid/platform/event.h | 11 +- paddle/fluid/platform/flags.cc | 10 +- paddle/fluid/platform/for_range.h | 2 +- paddle/fluid/platform/init.cc | 18 +- paddle/fluid/platform/init_test.cc | 5 +- paddle/fluid/platform/miopen_helper.h | 552 ++++++++++++++++++ paddle/fluid/platform/miopen_helper_test.cc | 93 +++ 17 files changed, 854 insertions(+), 34 deletions(-) create mode 100644 paddle/fluid/platform/miopen_helper.h create mode 100644 paddle/fluid/platform/miopen_helper_test.cc diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 108e1240c5..377ea37677 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -16,13 +16,20 @@ endif() if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) + cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) +endif() + +if (WITH_ROCM) + hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) + hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) -nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) -if (WITH_GPU) +if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) @@ -40,6 +47,16 @@ if (WITH_GPU) cuda_allocator device_context memcpy) +elseif (WITH_ROCM) + hip_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS best_fit_allocator + locked_allocator + cpu_allocator + cuda_allocator + device_context + memcpy) else() cc_test(best_fit_allocator_test SRCS best_fit_allocator_test.cc @@ -57,7 +74,7 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) - if (WITH_GPU AND TARGET retry_allocator_test) + if ((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) target_link_libraries(retry_allocator_test cuda_allocator) endif() diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index c9fbaf351e..d015ed7ce6 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -12,8 +12,15 @@ // See the License for the specific 
language governing permissions and // limitations under the License. +#ifdef PADDLE_WITH_CUDA #include #include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + #include // NOLINT #include @@ -40,8 +47,13 @@ __global__ void kernel(float *x, int n) { void CheckKernelOutput(float *x, int n) { auto host_x = std::unique_ptr(new float[n]); for (int i = 0; i < n; ++i) { +#ifdef PADDLE_WITH_HIP + EXPECT_TRUE(hipSuccess == hipMemcpy(host_x.get(), x, n * sizeof(float), + hipMemcpyDeviceToHost)); +#else EXPECT_TRUE(cudaSuccess == cudaMemcpy(host_x.get(), x, n * sizeof(float), cudaMemcpyDeviceToHost)); +#endif EXPECT_GE(host_x[i] + DELTA, 3.14159f * i); EXPECT_LE(host_x[i] - DELTA, 3.14159f * i); } @@ -53,13 +65,22 @@ void MultiStreamCompute(float **data, float **second_data, AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *data = reinterpret_cast(allocation_ptr->ptr()); +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, ctx.stream(), *data, N); +#else kernel<<<1, 64, 0, ctx.stream()>>>(*data, N); +#endif // allocate and compute on same stream again allocation_ptr = Alloc(ctx, N * sizeof(float)); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *second_data = reinterpret_cast(allocation_ptr->ptr()); +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, ctx.stream(), *second_data, + N); +#else kernel<<<1, 64, 0, ctx.stream()>>>(*second_data, N); +#endif } TEST(Malloc, CUDADeviceContextMultiStream) { @@ -75,8 +96,12 @@ TEST(Malloc, CUDADeviceContextMultiStream) { float *second_data[NUM_STREAMS]; CudaDevCtxVec dev_ctx; - // default stream +// default stream +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, 0, main_stream_data, N); +#else kernel<<<1, 64>>>(main_stream_data, N); +#endif main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { @@ -85,7 +110,11 @@ TEST(Malloc, CUDADeviceContextMultiStream) { MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]); } +#ifdef PADDLE_WITH_HIP + EXPECT_TRUE(hipSuccess == hipDeviceSynchronize()); +#else EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize()); +#endif for (int i = 0; i < NUM_STREAMS; ++i) { CheckKernelOutput(data[i], N); CheckKernelOutput(second_data[i], N); @@ -106,8 +135,12 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { CudaDevCtxVec dev_ctx; std::vector threads; - // default stream +// default stream +#ifdef PADDLE_WITH_HIP + hipLaunchKernelGGL((kernel), dim3(1), dim3(64), 0, 0, main_stream_data, N); +#else kernel<<<1, 64>>>(main_stream_data, N); +#endif main_stream_alloc_ptr.reset(); for (int i = 0; i < NUM_STREAMS; ++i) { @@ -120,8 +153,11 @@ TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) { for (int i = 0; i < NUM_STREAMS; ++i) { threads[i].join(); } - +#ifdef PADDLE_WITH_HIP + EXPECT_TRUE(hipSuccess == hipDeviceSynchronize()); +#else EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize()); +#endif for (int i = 0; i < NUM_STREAMS; ++i) { CheckKernelOutput(data[i], N); CheckKernelOutput(second_data[i], N); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index bd8371e474..6a1d44f6cf 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -196,9 +196,22 @@ void Copy(platform::XPUPlace dst_place, } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K +#ifdef PADDLE_WITH_HIP +inline void SyncCUDAStream() { +#if 
!defined(_WIN32) + hipStreamSynchronize(0); +#else + hipError_t e_sync = hipSuccess; + while (e_sync = hipStreamQuery(0)) { + if (e_sync == hipErrorNotReady) continue; + break; + } +#endif +} +#else inline void SyncCUDAStream() { #if !defined(_WIN32) cudaStreamSynchronize(0); @@ -210,6 +223,7 @@ inline void SyncCUDAStream() { } #endif } +#endif // NOTE(zcd): Do not use GpuMemcpySync as much as possible. // because GpuMemcpySync issues the copying command to the default stream, @@ -228,10 +242,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +#else platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +#endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); +#endif // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { SyncCUDAStream(); @@ -250,10 +272,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +#else platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +#endif } else { platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); +#else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); +#endif // FIXME(zjl): do we really need it? if (num <= kMaxGpuAsyncCopyBytes) { SyncCUDAStream(); @@ -273,10 +303,18 @@ void Copy( platform::SetDeviceId(src_place.device); if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, stream); +#else platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); +#endif } else { platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); +#else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); +#endif } } else { if (stream) { @@ -332,10 +370,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, stream); +#else platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); +#endif } else { platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); +#else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); +#endif } } @@ -351,10 +397,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, stream); +#else platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); +#endif } else { platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); +#ifdef PADDLE_WITH_HIP + platform::GpuMemcpySync(dst, src, num, 
hipMemcpyHostToDevice); +#else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); +#endif } } diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 0d898f59ee..76a880755e 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -41,27 +41,44 @@ float test_pinned_memory() { const int iteration = 10; // create event start and end - cudaEvent_t start_e, stop_e, copying_e; + gpuEvent_t start_e, stop_e, copying_e; float elapsedTime = 0; + +#ifdef PADDLE_WITH_HIP + hipEventCreate(&start_e); + hipEventCreate(&stop_e); + hipEventCreate(©ing_e); +#else cudaEventCreate(&start_e); cudaEventCreate(&stop_e); cudaEventCreate(©ing_e); +#endif // create computation stream, data copying stream - cudaStream_t computation_stream, copying_stream; + gpuStream_t computation_stream, copying_stream; +#ifdef PADDLE_WITH_HIP + hipStreamCreate(&computation_stream); + hipStreamCreate(©ing_stream); +#else cudaStreamCreate(&computation_stream); cudaStreamCreate(©ing_stream); +#endif // create record event, pinned memory, gpu memory - std::vector record_event(iteration); + std::vector record_event(iteration); std::vector input_pinned_mem(iteration); std::vector gpu_mem(iteration); std::vector output_pinned_mem(iteration); // initial data for (int j = 0; j < iteration; ++j) { +#ifdef PADDLE_WITH_HIP + hipEventCreateWithFlags(&record_event[j], hipEventDisableTiming); + hipEventCreate(&(record_event[j])); +#else cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming); cudaEventCreate(&(record_event[j])); +#endif input_pinned_mem[j] = static_cast( paddle::memory::Alloc(cpu_place, data_size * sizeof(float))); output_pinned_mem[j] = static_cast( @@ -74,7 +91,11 @@ float test_pinned_memory() { } } +#ifdef PADDLE_WITH_HIP + hipEventRecord(start_e, computation_stream); +#else cudaEventRecord(start_e, computation_stream); +#endif // computation for (int m = 0; m < 30; ++m) { @@ -88,13 +109,21 @@ float test_pinned_memory() { // call kernel on computation stream. Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size); +#ifdef PADDLE_WITH_HIP + // record event_computation on computation stream + hipEventRecord(record_event[i], computation_stream); + + // wait event_computation on copy stream. + // note: this operation is async. + hipStreamWaitEvent(copying_stream, record_event[i], 0); +#else // record event_computation on computation stream cudaEventRecord(record_event[i], computation_stream); // wait event_computation on copy stream. // note: this operation is async. cudaStreamWaitEvent(copying_stream, record_event[i], 0); - +#endif // copy data GPU->CPU, on copy stream. // note: this operation is async for pinned memory. 
paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place, @@ -103,6 +132,16 @@ float test_pinned_memory() { } } +#ifdef PADDLE_WITH_HIP + hipEventRecord(copying_e, copying_stream); + hipStreamWaitEvent(computation_stream, copying_e, 0); + + hipEventRecord(stop_e, computation_stream); + + hipEventSynchronize(start_e); + hipEventSynchronize(stop_e); + hipEventElapsedTime(&elapsedTime, start_e, stop_e); +#else cudaEventRecord(copying_e, copying_stream); cudaStreamWaitEvent(computation_stream, copying_e, 0); @@ -111,6 +150,7 @@ float test_pinned_memory() { cudaEventSynchronize(start_e); cudaEventSynchronize(stop_e); cudaEventElapsedTime(&elapsedTime, start_e, stop_e); +#endif // std::cout << cpu_place << " " // << "time consume:" << elapsedTime / 30 << std::endl; @@ -123,12 +163,22 @@ float test_pinned_memory() { } } - // destroy resource +// destroy resource +#ifdef PADDLE_WITH_HIP + hipEventDestroy(copying_e); + hipEventDestroy(start_e); + hipEventDestroy(stop_e); +#else cudaEventDestroy(copying_e); cudaEventDestroy(start_e); cudaEventDestroy(stop_e); +#endif for (int j = 0; j < 10; ++j) { +#ifdef PADDLE_WITH_HIP + hipEventDestroy((record_event[j])); +#else cudaEventDestroy((record_event[j])); +#endif paddle::memory::Free(cpu_place, input_pinned_mem[j]); paddle::memory::Free(cpu_place, output_pinned_mem[j]); paddle::memory::Free(cuda_place, gpu_mem[j]); diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index 8b57de9349..b287d11a9f 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -21,7 +21,7 @@ size_t Alignment(size_t size, const platform::Place &place) { if (platform::is_cpu_place(place)) { alignment = CpuMinChunkSize(); } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) alignment = GpuMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index 5cc33fd31f..a151e43483 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/gpu_info.h" #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 6669d18f75..fbdfc4928c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include +#include #include "gflags/gflags.h" #include "glog/logging.h" @@ -337,7 +338,7 @@ void* GetNVRTCDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so", false); + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -347,7 +348,7 @@ void* GetCUDADsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so", false); + return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); #endif diff --git a/paddle/fluid/platform/dynload/hiprtc.h b/paddle/fluid/platform/dynload/hiprtc.h index 7cc58489fa..4b376f1858 100644 --- a/paddle/fluid/platform/dynload/hiprtc.h +++ b/paddle/fluid/platform/dynload/hiprtc.h @@ -45,6 +45,7 @@ extern bool HasNVRTC(); * include all needed hiprtc functions **/ #define HIPRTC_ROUTINE_EACH(__macro) \ + __macro(hiprtcVersion); \ __macro(hiprtcGetErrorString); \ __macro(hiprtcCompileProgram); \ __macro(hiprtcCreateProgram); \ diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 57fec91ffb..43a3e1a107 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -16,10 +16,15 @@ limitations under the License. */ #include #include +#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" +#define MIOPEN_VERSION \ + (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 100 + \ + MIOPEN_VERSION_PATCH) // NOLINT + namespace paddle { namespace platform { namespace dynload { diff --git a/paddle/fluid/platform/dynload/rocm_driver.h b/paddle/fluid/platform/dynload/rocm_driver.h index 7633e84c85..4527b6d6e4 100644 --- a/paddle/fluid/platform/dynload/rocm_driver.h +++ b/paddle/fluid/platform/dynload/rocm_driver.h @@ -46,6 +46,7 @@ extern bool HasCUDADriver(); * include all needed cuda driver functions **/ #define ROCM_ROUTINE_EACH(__macro) \ + __macro(hipDriverGetVersion); \ __macro(hipGetErrorString); \ __macro(hipModuleLoadData); \ __macro(hipModuleGetFunction); \ diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 9a482a63f5..0985b884d1 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #endif +#ifdef PADDLE_WITH_HIP +#include +#endif #include "paddle/fluid/platform/place.h" namespace paddle { @@ -48,9 +51,9 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifndef PADDLE_WITH_CUPTI - cudaEvent_t event() const { return event_; } + gpuEvent_t event() const { return event_; } int device() const { return device_; } #endif #endif @@ -66,7 +69,7 @@ class Event { EventRole role_{}; int64_t cpu_ns_; bool visited_status_{false}; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -77,7 +80,7 @@ class Event { private: #else - cudaEvent_t event_ = nullptr; + gpuEvent_t event_ = nullptr; int device_ = -1; #endif #endif diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 20be80b176..1a55562f2b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "gflags/gflags.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cudnn_workspace_helper.h" #endif @@ -45,7 +45,7 @@ DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. It will be " "extremely slow so please use this flag wisely."); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDA related related FLAG @@ -84,7 +84,7 @@ DEFINE_string(selected_gpus, "", "share-memory only."); #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDNN related FLAG @@ -167,7 +167,7 @@ DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, "batch_norm, default is False."); #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * NCCL related FLAG @@ -377,7 +377,7 @@ DEFINE_double( "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * Memory related FLAG diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index c153e80fe4..1869f3e2f7 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -40,7 +40,7 @@ struct ForRange { size_t limit_; }; -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ static void ForRangeElemwiseOpGridIsOne(Function func) { size_t idx = static_cast(threadIdx.x); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 7e5bce29bc..ea89082733 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/cupti.h" #endif #include "paddle/fluid/platform/device_context.h" @@ -92,6 +94,7 @@ bool InitGflags(std::vector args) { return successed; } +#ifdef PADDLE_WITH_CUDA void InitCupti() { #ifdef PADDLE_WITH_CUPTI if (FLAGS_multiple_of_cupti_buffer_size == 1) return; @@ -117,14 +120,17 @@ void InitCupti() { #undef MULTIPLY_ATTR_VALUE #endif } +#endif void InitDevices() { - // CUPTI attribute should be set before any CUDA context is created (see CUPTI - // documentation about CUpti_ActivityAttribute). +// CUPTI attribute should be set before any CUDA context is created (see CUPTI +// documentation about CUpti_ActivityAttribute). +#ifdef PADDLE_WITH_CUDA InitCupti(); +#endif /*Init all available devices by default */ std::vector devices; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) try { // use user specified GPUs in single-node multi-process mode. devices = platform::GetSelectedDevices(); @@ -154,7 +160,7 @@ void InitDevices(const std::vector devices) { continue; } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPlace(devices[i])); #endif #ifdef PADDLE_WITH_XPU @@ -162,7 +168,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); #endif platform::DeviceContextPool::Init(places); diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index b6ede497a3..965fe7b6db 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -19,7 +19,8 @@ TEST(InitDevices, CPU) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \ + !defined(PADDLE_WITH_HIP) InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); ASSERT_EQ(pool.size(), 1U); @@ -30,7 +31,7 @@ TEST(InitDevices, CUDA) { using paddle::framework::InitDevices; using paddle::platform::DeviceContextPool; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int count = paddle::platform::GetCUDADeviceCount(); InitDevices(); DeviceContextPool& pool = DeviceContextPool::Instance(); diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/miopen_helper.h new file mode 100644 index 0000000000..f604513085 --- /dev/null +++ b/paddle/fluid/platform/miopen_helper.h @@ -0,0 +1,552 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/dynload/miopen.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/macros.h" + +// MIOPEN do not have epslion definition +#define CUDNN_BN_MIN_EPSILON 1e-05 + +namespace paddle { +namespace platform { +struct float16; +} // namespace platform +} // namespace paddle + +DECLARE_bool(cudnn_deterministic); + +namespace paddle { +namespace platform { + +// MIOPEN only support NCHW, just for compatibility with CUDNN API +typedef enum { + MIOPEN_TENSOR_NCHW = 0, + MIOPEN_TENSOR_NHWC = 1, +} miopenTensorFormat_t; + +// MIOPEN do not support indirect function call defined in cudnnWorkspaceHandle +struct miopenWorkspace { + explicit miopenWorkspace(size_t size) : size(size), data(NULL) { + PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&data, size)); + } + miopenWorkspace(const miopenWorkspace&) = delete; + miopenWorkspace(miopenWorkspace&&) = default; + miopenWorkspace& operator=(miopenWorkspace&&) = default; + ~miopenWorkspace() { + if (data) { + hipFree(data); + } + } + size_t size; + void* data; +}; + +inline const char* miopenGetErrorString(miopenStatus_t status) { + switch (status) { + case miopenStatusSuccess: + return "miopenStatusSuccess"; + case miopenStatusNotInitialized: + return "miopenStatusNotInitialized"; + case miopenStatusAllocFailed: + return "miopenStatusAllocFailed"; + case miopenStatusBadParm: + return "miopenStatusBadParm"; + case miopenStatusInternalError: + return "miopenStatusInternalError"; + case miopenStatusInvalidValue: + return "miopenStatusInvalidValue"; + case miopenStatusUnknownError: + return "miopenStatusUnknownError"; + case miopenStatusNotImplemented: + return "miopenStatusNotImplemented"; + default: + return "Unknown miopen error number"; + } +} + +// no use, but will have compiling error if not defined +#define CUDNN_VERSION_MIN(major, minor, patch) \ + (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) + +enum class DataLayout { // Not use + kNHWC, + kNCHW, + kNCDHW, + kNDHWC, // add, liyamei + kNCHW_VECT_C, +}; + +enum class PoolingMode { + kMaximum, + kMaximumDeterministic, + kAverageExclusive, + kAverageInclusive, +}; + +enum class ActivationMode { + kNone, // activation identity + kSigmoid, + kRelu, + kRelu6, + kReluX, + kTanh, + kBandPass, +}; + +inline miopenPoolingMode_t GetPoolingMode(const PoolingMode& mode) { + switch (mode) { + case PoolingMode::kMaximumDeterministic: + return miopenPoolingMax; + case PoolingMode::kAverageExclusive: + return miopenPoolingAverage; + case PoolingMode::kAverageInclusive: + return miopenPoolingAverageInclusive; + case PoolingMode::kMaximum: + return miopenPoolingMax; + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unexpected MIOPEN pooling mode.")); + } +} + +inline ActivationMode StringToActivationMode(const std::string& str) { + if (str == "identity") { + return ActivationMode::kNone; + } else if (str == "sigmoid") { + return ActivationMode::kSigmoid; + } else if (str == "relu") { + return ActivationMode::kRelu; + } else if (str == "relu6") { + return ActivationMode::kRelu6; + } else if (str == "relux") { + return ActivationMode::kReluX; + } else if (str == "tanh") { + return ActivationMode::kTanh; + } else if (str == "bandpass") { + return ActivationMode::kBandPass; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unknown MIOPEN activation string: %s.", str)); + } +} + +template +class CudnnDataType; 
+ +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenHalf; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenFloat; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + +inline miopenTensorFormat_t GetCudnnTensorFormat(const DataLayout& order) { + switch (order) { + case DataLayout::kNHWC: + return MIOPEN_TENSOR_NHWC; + case DataLayout::kNCHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNCDHW: + return MIOPEN_TENSOR_NCHW; + case DataLayout::kNDHWC: + return MIOPEN_TENSOR_NHWC; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "MIOPEN has no equivalent dataLayout for input order.")); + } + return MIOPEN_TENSOR_NCHW; +} + +class ScopedTensorDescriptor { + public: + ScopedTensorDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedTensorDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& dims, + const int groups = 1) { + // the format is not used now, will add later + std::vector strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + // Update tensor descriptor dims setting if groups > 1 + // NOTE: Here, Assume using NCHW or NCDHW order + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + + // MIOPEN ONLY support data layout of NCHW + PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW, + platform::errors::InvalidArgument( + "format should ONLY be NCHW in MIOPEN.")); + if (dims.size() == 4) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, type, dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } else if (dims.size() == 5) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, type, dims_with_group.size(), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); + } + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& dims, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, dims, + groups); + } + + inline miopenTensorDescriptor_t descriptor(const miopenDataType_t miopen_type, + const std::vector& dim, + const std::vector& stride) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, miopen_type, dim.size(), const_cast(dim.data()), + const_cast(stride.data()))); + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const std::vector& dim, + const std::vector& stride) { + return descriptor(CudnnDataType::type, dim, stride); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + 
miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); +}; + +class ScopedDropoutDescriptor { + public: + ScopedDropoutDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateDropoutDescriptor(&desc_)); + } + ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyDropoutDescriptor(desc_)); + } + + inline miopenDropoutDescriptor_t descriptor(const miopenHandle_t& handle, + const platform::Place& place, + bool initialized, + float dropout_prob_, + framework::Tensor* dropout_state_, + int seed, size_t state_size) { + if (dropout_state_ == nullptr) { // for no dropout or test + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, + 0 /* seed */, false, false, MIOPEN_RNG_PSEUDO_XORWOW)); + return desc_; + } + auto* dropout_state_data = dropout_state_->data(); + if (!initialized) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, seed, + false, false, MIOPEN_RNG_PSEUDO_XORWOW)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenRestoreDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, 0, + false, false, MIOPEN_RNG_PSEUDO_XORWOW)); + } + return desc_; + } + inline miopenDropoutDescriptor_t desc() { return desc_; } + + private: + miopenDropoutDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); +}; + +class ScopedRNNDescriptor { + public: + ScopedRNNDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateRNNDescriptor(&desc_)); + } + ~ScopedRNNDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyRNNDescriptor(desc_)); + } + + inline miopenRNNDescriptor_t desc() { return desc_; } + + private: + miopenRNNDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); +}; + +class ScopedFilterDescriptor { + public: + ScopedFilterDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateTensorDescriptor(&desc_)); + } + ~ScopedFilterDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyTensorDescriptor(desc_)); + } + + inline miopenTensorDescriptor_t descriptor(const miopenTensorFormat_t format, + const miopenDataType_t type, + const std::vector& kernel, + const int groups = 1) { + // filter layout: MCHW(MCDHW), where M is the number of + // output image channels, C is the number of input image channels, + // D is the depth of the filter, H is the height of the filter, and W is the + // width of the filter. + std::vector kernel_with_group(kernel.begin(), kernel.end()); + if (groups > 1) { + kernel_with_group[0] /= groups; + // NOTE: input filter(C) of the filter is already asserted to be C/groups. 
+ } + std::vector stride_dim(kernel_with_group.size()); + stride_dim.push_back(1); + for (int k = kernel_with_group.size() - 2; k >= 0; k--) { + stride_dim[k] = stride_dim[k + 1] * kernel_with_group[k + 1]; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + desc_, type, kernel_with_group.size(), + const_cast(kernel_with_group.data()), + const_cast(stride_dim.data()))); + return desc_; + } + + template + inline miopenTensorDescriptor_t descriptor(const DataLayout& order, + const std::vector& kernel, + const int groups = 1) { + return descriptor(GetCudnnTensorFormat(order), CudnnDataType::type, + kernel, groups); + } + + inline miopenTensorDescriptor_t desc() { return desc_; } + + private: + miopenTensorDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); +}; + +class ScopedConvolutionDescriptor { + public: + ScopedConvolutionDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenCreateConvolutionDescriptor(&desc_)); + } + ~ScopedConvolutionDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenDestroyConvolutionDescriptor(desc_)); + } + + inline miopenConvolutionDescriptor_t descriptor( + miopenDataType_t type, const std::vector& pads, + const std::vector& strides, const std::vector& dilations) { + PADDLE_ENFORCE_EQ(pads.size(), strides.size(), + platform::errors::InvalidArgument( + "The size of pads and strides should be equal. But " + "received size of pads is %d, size of strides is %d.", + pads.size(), strides.size())); + PADDLE_ENFORCE_EQ( + pads.size(), dilations.size(), + platform::errors::InvalidArgument( + "The size of pads and dilations should be equal. But received size " + "of pads is %d, size of dilations is %d.", + pads.size(), dilations.size())); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenInitConvolutionNdDescriptor( + desc_, pads.size(), const_cast(pads.data()), + const_cast(strides.data()), const_cast(dilations.data()), + miopenConvolution)); + return desc_; + } + + template + inline miopenConvolutionDescriptor_t descriptor( + const std::vector& pads, const std::vector& strides, + const std::vector& dilations) { + return descriptor(CudnnDataType::type, pads, strides, dilations); + } + + private: + miopenConvolutionDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor); +}; + +class ScopedPoolingDescriptor { + public: + ScopedPoolingDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreatePoolingDescriptor(&desc_)); + } + ~ScopedPoolingDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyPoolingDescriptor(desc_)); + } + + inline miopenPoolingDescriptor_t descriptor(const PoolingMode& mode, + const std::vector& kernel, + const std::vector& pads, + const std::vector& strides) { + PADDLE_ENFORCE_EQ(kernel.size(), pads.size(), + platform::errors::InvalidArgument( + "The size of kernel and pads should be equal. But " + "received size of kernel is %d, size of pads is %d.", + kernel.size(), pads.size())); + PADDLE_ENFORCE_EQ( + kernel.size(), strides.size(), + platform::errors::InvalidArgument( + "The size of kernel and strides should be equal. 
But " + "received size of kernel is %d, size of strides is %d.", + kernel.size(), strides.size())); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet2dPoolingDescriptor( + desc_, GetPoolingMode(mode), kernel[0], kernel[1], pads[0], pads[1], + strides[0], strides[1])); + return desc_; + } + + private: + miopenPoolingDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); +}; + +class ScopedActivationDescriptor { + public: + ScopedActivationDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenCreateActivationDescriptor(&desc_)); + } + ~ScopedActivationDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::miopenDestroyActivationDescriptor(desc_)); + } + + template + inline miopenActivationDescriptor_t descriptor( + const std::string& act, double value_max = static_cast(0.)) { + double relu_ceiling = 0.0; + ActivationMode activation_mode = StringToActivationMode(act); + miopenActivationMode_t mode; + switch (activation_mode) { + case ActivationMode::kNone: + mode = miopenActivationPASTHRU; + break; + case ActivationMode::kRelu6: + relu_ceiling = 6.0; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kReluX: + relu_ceiling = value_max; + mode = miopenActivationCLIPPEDRELU; + break; + case ActivationMode::kRelu: + mode = miopenActivationRELU; + break; + case ActivationMode::kSigmoid: + mode = miopenActivationLOGISTIC; + break; + case ActivationMode::kTanh: + mode = miopenActivationTANH; + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unrecognized MIOPEN activation mode: %d.", + static_cast(activation_mode))); + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetActivationDescriptor( + desc_, mode, relu_ceiling, 0.0, 0.0)); + return desc_; + } + + private: + miopenActivationDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedActivationDescriptor); +}; + +inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_HIP + if (use_cudnn) { + auto& dev_ctx = ctx.device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + return use_cudnn; +} + +class ScopedCTCLossDescriptor { + public: + ScopedCTCLossDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenCreateCTCLossDescriptor(&desc_)); + } + ~ScopedCTCLossDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenDestroyCTCLossDescriptor(desc_)); + } + + template + inline miopenCTCLossDescriptor_t descriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetCTCLossDescriptor( + desc_, CudnnDataType::type, 0, false)); + return desc_; + } + + private: + miopenCTCLossDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedCTCLossDescriptor); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/miopen_helper_test.cc b/paddle/fluid/platform/miopen_helper_test.cc new file mode 100644 index 0000000000..e201f4893f --- /dev/null +++ b/paddle/fluid/platform/miopen_helper_test.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL + +#include "paddle/fluid/platform/miopen_helper.h" + +#include + +TEST(MIOpenHelper, ScopedTensorDescriptor) { + using paddle::platform::ScopedTensorDescriptor; + using paddle::platform::DataLayout; + + ScopedTensorDescriptor tensor_desc; + std::vector shape = {2, 4, 6, 6}; + auto desc = tensor_desc.descriptor(DataLayout::kNCHW, shape); + + miopenDataType_t type; + int nd; + std::vector dims(4); + std::vector strides(4); + paddle::platform::dynload::miopenGetTensorDescriptor(desc, &type, dims.data(), + strides.data()); + paddle::platform::dynload::miopenGetTensorDescriptorSize(desc, &nd); + + EXPECT_EQ(nd, 4); + for (size_t i = 0; i < dims.size(); ++i) { + EXPECT_EQ(dims[i], shape[i]); + } + EXPECT_EQ(strides[3], 1); + EXPECT_EQ(strides[2], 6); + EXPECT_EQ(strides[1], 36); + EXPECT_EQ(strides[0], 144); + + // test tensor5d: ScopedTensorDescriptor + ScopedTensorDescriptor tensor5d_desc; + std::vector shape_5d = {2, 4, 6, 6, 6}; + auto desc_5d = tensor5d_desc.descriptor(DataLayout::kNCDHW, shape_5d); + + std::vector dims_5d(5); + std::vector strides_5d(5); + paddle::platform::dynload::miopenGetTensorDescriptor( + desc_5d, &type, dims_5d.data(), strides_5d.data()); + paddle::platform::dynload::miopenGetTensorDescriptorSize(desc_5d, &nd); + + EXPECT_EQ(nd, 5); + for (size_t i = 0; i < dims_5d.size(); ++i) { + EXPECT_EQ(dims_5d[i], shape_5d[i]); + } + EXPECT_EQ(strides_5d[4], 1); + EXPECT_EQ(strides_5d[3], 6); + EXPECT_EQ(strides_5d[2], 36); + EXPECT_EQ(strides_5d[1], 216); + EXPECT_EQ(strides_5d[0], 864); +} + +TEST(MIOpenHelper, ScopedConvolutionDescriptor) { + using paddle::platform::ScopedConvolutionDescriptor; + + ScopedConvolutionDescriptor conv_desc; + std::vector src_pads = {2, 2, 2}; + std::vector src_strides = {1, 1, 1}; + std::vector src_dilations = {1, 1, 1}; + auto desc = conv_desc.descriptor(src_pads, src_strides, src_dilations); + + miopenConvolutionMode_t mode; + int nd; + std::vector pads(3); + std::vector strides(3); + std::vector dilations(3); + paddle::platform::dynload::miopenGetConvolutionNdDescriptor( + desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode); + + EXPECT_EQ(nd, 3); + for (size_t i = 0; i < src_pads.size(); ++i) { + EXPECT_EQ(pads[i], src_pads[i]); + EXPECT_EQ(strides[i], src_strides[i]); + EXPECT_EQ(dilations[i], src_dilations[i]); + } + EXPECT_EQ(mode, miopenConvolution); +} -- GitLab
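Note on the pattern used above (not part of the commit): the hunks repeatedly keep a single translation unit and select the CUDA or ROCm runtime behind PADDLE_WITH_CUDA / PADDLE_WITH_HIP guards, with kernel launches spelled as hipLaunchKernelGGL on the HIP path. The following minimal sketch shows the same guard structure in a self-contained program; the DEMO_WITH_HIP macro, the GPU_CHECK helper, and the scale kernel are hypothetical stand-ins, not Paddle code.

#include <cstdio>
#include <vector>

#ifdef DEMO_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
#define GPU_CHECK(expr)                                             \
  do {                                                              \
    gpuError_t err_ = (expr);                                       \
    if (err_ != hipSuccess)                                         \
      std::printf("HIP error: %s\n", hipGetErrorString(err_));      \
  } while (0)
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
#define GPU_CHECK(expr)                                             \
  do {                                                              \
    gpuError_t err_ = (expr);                                       \
    if (err_ != cudaSuccess)                                        \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err_));    \
  } while (0)
#endif

// Toy kernel; __global__ is accepted by both nvcc and hipcc.
__global__ void scale(float* x, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;
}

int main() {
  const int n = 256;
  std::vector<float> host(n, 1.0f);
  float* dev = nullptr;

#ifdef DEMO_WITH_HIP
  GPU_CHECK(hipMalloc(reinterpret_cast<void**>(&dev), n * sizeof(float)));
  GPU_CHECK(hipMemcpy(dev, host.data(), n * sizeof(float),
                      hipMemcpyHostToDevice));
  // HIP spells the triple-chevron launch as hipLaunchKernelGGL, the same
  // rewrite the patch applies to kernel<<<1, 64, 0, stream>>> in malloc_test.cu.
  hipLaunchKernelGGL(scale, dim3(1), dim3(256), 0, 0, dev, n);
  GPU_CHECK(hipDeviceSynchronize());
  GPU_CHECK(hipMemcpy(host.data(), dev, n * sizeof(float),
                      hipMemcpyDeviceToHost));
  GPU_CHECK(hipFree(dev));
#else
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev), n * sizeof(float)));
  GPU_CHECK(cudaMemcpy(dev, host.data(), n * sizeof(float),
                       cudaMemcpyHostToDevice));
  scale<<<1, 256>>>(dev, n);
  GPU_CHECK(cudaDeviceSynchronize());
  GPU_CHECK(cudaMemcpy(host.data(), dev, n * sizeof(float),
                       cudaMemcpyDeviceToHost));
  GPU_CHECK(cudaFree(dev));
#endif

  std::printf("host[0] = %.1f (expected 2.0)\n", host[0]);
  return 0;
}

Building with hipcc and -DDEMO_WITH_HIP exercises the ROCm branch, while a plain nvcc build takes the CUDA branch; this mirrors the switch the patch performs in CMake, where WITH_ROCM selects hip_library/hip_test and defines PADDLE_WITH_HIP for the same sources that WITH_GPU builds with nv_library/nv_test.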
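The new miopen_helper.h wraps every MIOpen descriptor in the same RAII shape: create the handle in the constructor, destroy it in the destructor, and disable copying (DISABLE_COPY_AND_ASSIGN in the patch). A stripped-down sketch of that shape follows; demoCreate/demoDestroy and DemoDesc are hypothetical stand-ins for the dynload::miopenCreate*/miopenDestroy* pairs and their opaque handle type, and a plain exception stands in for PADDLE_ENFORCE_CUDA_SUCCESS.

#include <stdexcept>

// Hypothetical stand-ins for an opaque MIOpen-style handle and its C API.
struct DemoDesc { int unused = 0; };
inline int demoCreate(DemoDesc** d) { *d = new DemoDesc{}; return 0; }
inline int demoDestroy(DemoDesc* d) { delete d; return 0; }

// Same outline as ScopedTensorDescriptor / ScopedPoolingDescriptor / ...:
// acquire in the constructor, release in the destructor, forbid copies,
// hand out the raw handle through desc().
class ScopedDemoDescriptor {
 public:
  ScopedDemoDescriptor() {
    if (demoCreate(&desc_) != 0) throw std::runtime_error("create failed");
  }
  ~ScopedDemoDescriptor() { demoDestroy(desc_); }  // release on scope exit

  ScopedDemoDescriptor(const ScopedDemoDescriptor&) = delete;
  ScopedDemoDescriptor& operator=(const ScopedDemoDescriptor&) = delete;

  DemoDesc* desc() const { return desc_; }  // borrowed; valid while this lives

 private:
  DemoDesc* desc_ = nullptr;
};

int main() {
  ScopedDemoDescriptor tensor_desc;  // handle freed automatically
  (void)tensor_desc.desc();
  return 0;
}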