Unverified commit 34f1628c, authored by Qi Li, committed by GitHub

[ROCM] update fluid platform for rocm39 (part2), test=develop (#30774)

Parent 5ded39f2
@@ -90,7 +90,7 @@ struct PADDLE_ALIGN(2) float16 {
   // Constructors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit float16(const half& h) {
-#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
 #else
@@ -366,10 +366,11 @@ struct PADDLE_ALIGN(2) float16 {
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-// xuan[TODO] change for rocm
-#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+// ROCM has built-in arithmetic operators, since __HIP_NO_HALF_OPERATORS__
+// is not defined
+#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000
 DEVICE inline half operator+(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
 #else
   float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
@@ -378,7 +379,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
 }

 DEVICE inline half operator-(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
 #else
   float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
@@ -387,7 +388,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
 }

 DEVICE inline half operator*(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
 #else
   float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
@@ -396,7 +397,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
 }

 DEVICE inline half operator/(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
@@ -407,7 +408,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
 }

 DEVICE inline half operator-(const half& a) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hneg(a);
 #else
   float res = -static_cast<float>(float16(a));
@@ -438,7 +439,7 @@ DEVICE inline half& operator/=(half& a, const half& b) {  // NOLINT
 #endif

 DEVICE inline bool operator==(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
 #else
   return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
@@ -446,7 +447,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
 }

 DEVICE inline bool operator!=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
 #else
   return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
@@ -454,7 +455,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
 }

 DEVICE inline bool operator<(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
 #else
   return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
@@ -462,7 +463,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
 }

 DEVICE inline bool operator<=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
 #else
   return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
@@ -470,7 +471,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
 }

 DEVICE inline bool operator>(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
 #else
   return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
@@ -478,7 +479,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
 }

 DEVICE inline bool operator>=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
 #else
   return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
@@ -489,9 +490,8 @@ DEVICE inline bool operator>=(const half& a, const half& b) {

 // Arithmetic operators for float16 on GPU
 #if defined(PADDLE_CUDA_FP16)
-// HIPCC has compile error if call __device__ function __hadd in __host__
-// __device__ function
+// HIPCC raises a compile error if a __device__ function such as __hadd or
+// __hsub is called inside a __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator+(const float16& a, const float16& b) {
   return float16(__hadd(half(a), half(b)));
@@ -509,8 +509,6 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 }
 #endif

-// HIPCC has compile error if call __device__ function __hsub in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a, const float16& b) {
   return float16(__hsub(half(a), half(b)));
@@ -528,8 +526,6 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 }
 #endif

-// HIPCC has compile error if call __device__ function __hmul in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator*(const float16& a, const float16& b) {
   return float16(__hmul(half(a), half(b)));
@@ -547,8 +543,16 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 }
 #endif

+#if defined(__HIPCC__)
+DEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(__hdiv(half(a), half(b)));
+}
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) / static_cast<float>(b));
+}
+#else
 HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   // TODO(kexinzhao): check which cuda version starts to support __hdiv
   float num = __half2float(half(a));
   float denom = __half2float(half(b));
@@ -557,9 +561,8 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) / static_cast<float>(b));
 #endif
 }
+#endif

-// HIPCC has compile error if call __device__ function __hneg in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a) {
   return float16(__hneg(half(a)));
@@ -601,8 +604,8 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
   return a;
 }
-// HIPCC has compile error if call __device__ function __heq in __host__
-// __device__ function
+// HIPCC raises a compile error if a __device__ function such as __heq or
+// __hne is called inside a __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline bool operator==(const float16& a, const float16& b) {
   return __heq(half(a), half(b));
@@ -610,7 +613,7 @@ DEVICE inline bool operator==(const float16& a, const float16& b) {
 HOST inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
 }
-#else  // CUDA
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(half(a), half(b));
@@ -618,47 +621,92 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
 #endif
 }
-#endif
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return __hne(half(a), half(b));
+}
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(half(a), half(b));
 #else
   return static_cast<float>(a) != static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator<(const float16& a, const float16& b) {
+  return __hlt(half(a), half(b));
+}
+HOST inline bool operator<(const float16& a, const float16& b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(half(a), half(b));
 #else
   return static_cast<float>(a) < static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return __hle(half(a), half(b));
+}
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(half(a), half(b));
 #else
   return static_cast<float>(a) <= static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator>(const float16& a, const float16& b) {
+  return __hgt(half(a), half(b));
+}
+HOST inline bool operator>(const float16& a, const float16& b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(half(a), half(b));
 #else
   return static_cast<float>(a) > static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return __hge(half(a), half(b));
+}
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(half(a), half(b));
 #else
   return static_cast<float>(a) >= static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

 // Arithmetic operators for float16 on ARMv8.2-A CPU
 #elif defined(PADDLE_WITH_NATIVE_FP16)
......
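The HIP-side pattern above is worth spelling out once: HIPCC cannot call __device__-only half intrinsics from a __host__ __device__ function, so each HOSTDEVICE operator is split into a DEVICE overload that uses the intrinsic and a HOST overload that falls back to float math. A minimal self-contained sketch of the same idea, with an illustrative function name AddFp16 that is not part of the patch:

#if defined(__HIPCC__)
// DEVICE expands to __device__: this overload may use half intrinsics.
DEVICE inline float16 AddFp16(const float16& a, const float16& b) {
  return float16(__hadd(half(a), half(b)));
}
// HOST expands to __host__: plain float math, no device intrinsics.
HOST inline float16 AddFp16(const float16& a, const float16& b) {
  return float16(static_cast<float>(a) + static_cast<float>(b));
}
#else
// NVCC accepts a single __host__ __device__ definition that branches on
// __CUDA_ARCH__, which is only defined during the device compilation pass.
HOSTDEVICE inline float16 AddFp16(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return float16(__hadd(half(a), half(b)));
#else
  return float16(static_cast<float>(a) + static_cast<float>(b));
#endif
}
#endif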
@@ -22,30 +22,109 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"

 #define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+  __global__ void op_type(const half *in1, const half *in2, half *out) { \
     out[0] = in1[0] sign in2[0]; \
   }

 #define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+  __global__ void op_type(half *in1, const half *in2) { in1[0] sign in2[0]; }

 #define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+  __global__ void op_type(const half *in1, const half *in2, bool *out) { \
     out[0] = in1[0] sign in2[0]; \
   }
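For orientation, the invocation ARITHMETIC_KERNEL(Add, +) used further down in this file expands to a one-element, single-thread kernel:

// Expansion of ARITHMETIC_KERNEL(Add, +): applies the operator to one value.
__global__ void Add(const half *in1, const half *in2, half *out) {
  out[0] = in1[0] + in2[0];
}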
+#ifdef PADDLE_WITH_HIP
+#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }

+#define COMPOUND_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \
+    hipMemcpy(in1, d_in1, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+  }

+#define COMPARISON_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    bool *out, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, 1, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(out[0], v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }
+#else
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
     half *in1, *in2, *out; \
     half *d_in1, *d_in2, *d_out; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -67,10 +146,10 @@ limitations under the License. */
     half *in1, *in2; \
     half *d_in1, *d_in2; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -91,12 +170,12 @@ limitations under the License. */
     half *d_in1, *d_in2; \
     bool *out, *d_out; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<bool*>(malloc(1)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -111,12 +190,14 @@ limitations under the License. */
     cudaFree(d_in2); \
     cudaFree(d_out); \
   }
+#endif

 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
 namespace platform {

-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || \
+    (defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -128,21 +209,37 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)

 // Negative sign kernel
-__global__ void Neg(half* in) { in[0] = -in[0]; }
+__global__ void Neg(half *in) { in[0] = -in[0]; }

 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
   half *in, *d_in;
   int size = sizeof(half);
-  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
-  in = reinterpret_cast<half*>(malloc(size));
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in), size);
+#else
+  cudaMalloc(reinterpret_cast<void **>(&d_in), size);
+#endif
+  in = reinterpret_cast<half *>(malloc(size));
   in[0] = half(float16(v_in));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
+#else
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+#endif
   Neg<<<1, 1>>>(d_in);
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(in, d_in, size, hipMemcpyDeviceToHost);
+#else
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+#endif
   EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
   free(in);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in);
+#else
   cudaFree(d_in);
+#endif
 }

 COMPOUND_KERNEL(AddAssign, +=)
@@ -221,7 +318,7 @@ TEST(float16, lod_tensor_on_gpu) {
   framework::LoDTensor gpu_tensor;
   framework::LoDTensor dst_tensor;

-  float16* src_ptr = src_tensor.mutable_data<float16>(
+  float16 *src_ptr = src_tensor.mutable_data<float16>(
       framework::make_ddim({2, 2}), CPUPlace());

   float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
@@ -238,7 +335,7 @@ TEST(float16, lod_tensor_on_gpu) {
   // Sync before comparing LoDTensors
   gpu_ctx.Wait();
-  const float16* dst_ptr = dst_tensor.data<float16>();
+  const float16 *dst_ptr = dst_tensor.data<float16>();
   ASSERT_NE(src_ptr, dst_ptr);
   for (size_t i = 0; i < 4; ++i) {
     EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
@@ -247,7 +344,7 @@ TEST(float16, lod_tensor_on_gpu) {

 template <typename T>
 struct Functor {
-  bool operator()(const T& val) {
+  bool operator()(const T &val) {
     return std::type_index(typeid(T)) ==
            std::type_index(typeid(platform::float16));
   }
@@ -304,13 +401,13 @@ TEST(float16, cast) {
   auto b = a;
   {
     // change semantic, keep the same value
-    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    float16 c = reinterpret_cast<float16 &>(reinterpret_cast<unsigned &>(b));
     EXPECT_EQ(b, c);
   }

   {
     // use uint32 low 16 bit store float16
-    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    uint32_t c = reinterpret_cast<uint32_t &>(b);
     float16 d;
     d.x = c;
     EXPECT_EQ(b, d);
......
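The HIP launch macros above replace CUDA's triple-chevron launch with hipLaunchKernelGGL, which takes the kernel and launch configuration as ordinary arguments. Roughly, for the Add kernel with the same grid, block, shared-memory size, and default stream as used in those macros:

// CUDA form: kernel<<<grid, block, shared_mem_bytes, stream>>>(args...)
Add<<<dim3(1), dim3(1), 0, 0>>>(d_in1, d_in2, d_out);
// HIP equivalent: the configuration becomes leading arguments.
hipLaunchKernelGGL(Add, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out);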
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/gen_comm_id_helper.h"

 #include <arpa/inet.h>
@@ -336,7 +337,7 @@ void RecvBroadCastCommID(int server_fd, std::string endpoint,
   template void RecvBroadCastCommID<Type>(std::string endpoint, \
                                           std::vector<Type> * nccl_ids);

-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 INSTANT_TEMPLATE(ncclUniqueId)
 #endif
 #ifdef PADDLE_WITH_XPU_BKCL
......

@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include <functional>
 #include <string>
 #include <vector>
......
@@ -17,7 +17,11 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/miopen.h"
+#else
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
@@ -40,19 +44,34 @@ namespace platform {
 int CudnnVersion() {
   if (!dynload::HasCUDNN()) return -1;

+#ifdef PADDLE_WITH_HIP
+  size_t version_major, version_minor, version_patch;
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+      &version_major, &version_minor, &version_patch));
+  return version_major * 100 + version_minor * 10 + version_patch;
+#else
   return dynload::cudnnGetVersion();
+#endif
 }
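Note that the two return encodings differ: on ROCm the MIOpen version is packed as major * 100 + minor * 10 + patch, so MIOpen 2.9.0 yields 290, while on CUDA cudnnGetVersion() returns cuDNN's own encoding (for example, 7605 for cuDNN 7.6.5). Code comparing CudnnVersion() against a threshold has to account for both scales.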
 static int GetCUDADeviceCountImpl() {
   int driverVersion = 0;
+#ifdef PADDLE_WITH_HIP
+  hipError_t status = hipDriverGetVersion(&driverVersion);
+#else
   cudaError_t status = cudaDriverGetVersion(&driverVersion);
+#endif

-  if (!(status == cudaSuccess && driverVersion != 0)) {
+  if (!(status == gpuSuccess && driverVersion != 0)) {
     // No GPU driver
     VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
     return 0;
   }

+#ifdef PADDLE_WITH_HIP
+  const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES");
+#else
   const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
   if (cuda_visible_devices != nullptr) {
     std::string cuda_visible_devices_str(cuda_visible_devices);
     if (!cuda_visible_devices_str.empty()) {
@@ -68,12 +87,17 @@ static int GetCUDADeviceCountImpl() {
     if (std::all_of(cuda_visible_devices_str.begin(),
                     cuda_visible_devices_str.end(),
                     [](char ch) { return ch == ' '; })) {
-      VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
+      VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be "
+                 "empty. No GPU detected.";
       return 0;
     }
   }
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
+#endif
   return count;
 }
@@ -94,13 +118,24 @@ int GetCUDAComputeCapability(int id) {
                         id, GetCUDADeviceCount()));
   int major, minor;
+#ifdef PADDLE_WITH_HIP
+  auto major_error_code = hipDeviceGetAttribute(
+      &major, hipDeviceAttributeComputeCapabilityMajor, id);
+  auto minor_error_code = hipDeviceGetAttribute(
+      &minor, hipDeviceAttributeComputeCapabilityMinor, id);
+#else
   auto major_error_code =
       cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
   auto minor_error_code =
       cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
   PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
+#ifdef PADDLE_WITH_HIP
+  return major * 100 + minor;
+#else
   return major * 10 + minor;
+#endif
 }
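The same caveat applies here: under CUDA, compute capability 7.0 is returned as 70, which is the value TensorCoreAvailable() below compares against, while under ROCm the HIP major/minor pair is packed as major * 100 + minor, so a device reporting major 9, minor 0 yields 900. TensorCoreAvailable() itself is compiled out on ROCm, consistent with the differing scales.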
 dim3 GetGpuMaxGridDimSize(int id) {
@@ -111,15 +146,30 @@ dim3 GetGpuMaxGridDimSize(int id) {
                         id, GetCUDADeviceCount()));
   dim3 ret;
   int size;
+#ifdef PADDLE_WITH_HIP
+  auto error_code_x =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
+#else
   auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
   ret.x = size;

+#ifdef PADDLE_WITH_HIP
+  auto error_code_y =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
+#else
   auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
   ret.y = size;

+#ifdef PADDLE_WITH_HIP
+  auto error_code_z =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
+#else
   auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
   ret.z = size;
   return ret;
@@ -132,7 +182,11 @@ int GetCUDARuntimeVersion(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int runtime_version = 0;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
+#endif
   return runtime_version;
 }
@@ -143,12 +197,16 @@ int GetCUDADriverVersion(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int driver_version = 0;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
+#endif
   return driver_version;
 }

 bool TensorCoreAvailable() {
-#if CUDA_VERSION >= 9000
+#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000
   int device = GetCurrentDeviceId();
   int driver_version = GetCUDAComputeCapability(device);
   return driver_version >= 70;
@@ -164,8 +222,13 @@ int GetCUDAMultiProcessors(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
+#endif
   return count;
 }
@@ -176,8 +239,13 @@ int GetCUDAMaxThreadsPerMultiProcessor(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute(
+      &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
       &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
+#endif
   return count;
 }
@@ -188,14 +256,23 @@ int GetCUDAMaxThreadsPerBlock(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
+#endif
   return count;
 }

 int GetCurrentDeviceId() {
   int device_id;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
+#endif
   return device_id;
 }
@@ -224,7 +301,11 @@ void SetDeviceId(int id) {
                         "Device id must be less than GPU count, "
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
+#ifdef PADDLE_WITH_HIP
+  PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
+#else
   PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
+#endif
 }

 void GpuMemoryUsage(size_t *available, size_t *total) {
@@ -289,46 +370,91 @@ size_t GpuMaxChunkSize() {
   return max_chunk_size;
 }

+#ifdef PADDLE_WITH_HIP
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum hipMemcpyKind kind, hipStream_t stream) {
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
+}
+#else
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
 }
+#endif

+#ifdef PADDLE_WITH_HIP
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum hipMemcpyKind kind) {
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind));
+}
+#else
 void GpuMemcpySync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind) {
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
 }
+#endif

 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream) {
+                        int src_device, size_t count, gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+#endif
 }

 void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipMemcpyPeer(dst, dst_device, src, src_device, count));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaMemcpyPeer(dst, dst_device, src, src_device, count));
+#endif
 }

-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
+void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
+#endif
 }

-void GpuStreamSync(cudaStream_t stream) {
+void GpuStreamSync(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 }

-static void RaiseNonOutOfMemoryError(cudaError_t *status) {
+static void RaiseNonOutOfMemoryError(gpuError_t *status) {
+#ifdef PADDLE_WITH_HIP
+  if (*status == hipErrorOutOfMemory) {
+    *status = hipSuccess;
+  }
+#else
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);

+#ifdef PADDLE_WITH_HIP
+  *status = hipGetLastError();
+  if (*status == hipErrorOutOfMemory) {
+    *status = hipSuccess;
+  }
+#else
   *status = cudaGetLastError();
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);
 }
@@ -370,26 +496,38 @@ class RecordedCudaMallocHelper {
    * or cudaSuccess would be returned, and the cudaGetLastError() flag
    * would be clear.
    */
-  cudaError_t Malloc(void **ptr, size_t size) {
+  gpuError_t Malloc(void **ptr, size_t size) {
     LockGuardPtr<std::mutex> lock(mtx_);
     if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+#ifdef PADDLE_WITH_HIP
+      return hipErrorOutOfMemory;
+#else
       return cudaErrorMemoryAllocation;
+#endif
     }

     CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+    auto result = hipMalloc(ptr, size);
+#else
     auto result = cudaMalloc(ptr, size);
+#endif
-    if (result == cudaSuccess) {
+    if (result == gpuSuccess) {
       if (NeedRecord()) {
         cur_size_ += size;
       }
       STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
-      return cudaSuccess;
+      return gpuSuccess;
     } else {
       RaiseNonOutOfMemoryError(&result);
       // Non out of memory error would be raised inside
       // RaiseNonOutOfMemoryError. Therefore, we can
       // return cudaErrorMemoryAllocation directly here.
+#ifdef PADDLE_WITH_HIP
+      return hipErrorOutOfMemory;
+#else
       return cudaErrorMemoryAllocation;
+#endif
     }
   }
@@ -404,8 +542,13 @@ class RecordedCudaMallocHelper {
     // process is terminating, in which case we don't care if
     // cudaFree succeeds.
     CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+    auto err = hipFree(ptr);
+    if (err != hipErrorDeinitialized) {
+#else
     auto err = cudaFree(ptr);
     if (err != cudaErrorCudartUnloading) {
+#endif
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       if (NeedRecord()) {
         std::lock_guard<std::mutex> guard(*mtx_);
@@ -413,7 +556,11 @@ class RecordedCudaMallocHelper {
       }
       STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
     } else {
+#ifdef PADDLE_WITH_HIP
+      hipGetLastError();  // clear the error flag when hipErrorDeinitialized
+#else
       cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
+#endif
     }
   }
@@ -421,8 +568,12 @@ class RecordedCudaMallocHelper {
                   size_t *actual_total) {
     {
       CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+      auto result = hipMemGetInfo(actual_avail, actual_total);
+#else
       auto result = cudaMemGetInfo(actual_avail, actual_total);
+#endif
-      if (result != cudaSuccess) {
+      if (result != gpuSuccess) {
         *actual_avail = 0;
       }
       RaiseNonOutOfMemoryError(&result);
@@ -458,13 +609,13 @@ class RecordedCudaMallocHelper {
   static std::once_flag once_flag_;
   static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
-};
+};  // NOLINT

 std::once_flag RecordedCudaMallocHelper::once_flag_;
 std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
     RecordedCudaMallocHelper::instances_;

-cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
+gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
   return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
 }
......
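A hedged usage sketch of the recorded-allocation API declared below in gpu_info.h; the buffer size and device id here are illustrative:

// Allocate and free 256 bytes on device 0 through the recording wrapper so
// the per-device "STAT_gpu*_mem_size" counters stay balanced. gpuError_t and
// gpuSuccess are the platform-neutral aliases used throughout this patch.
void *ptr = nullptr;
gpuError_t err = paddle::platform::RecordedCudaMalloc(&ptr, 256, /*dev_id=*/0);
if (err == gpuSuccess) {
  // ... use ptr on device 0 ...
  paddle::platform::RecordedCudaFree(ptr, 256, /*dev_id=*/0);
}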
@@ -15,11 +15,19 @@ limitations under the License. */
 #pragma once

 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// Note: this header simplifies the HIP and CUDA type strings
 #include <stddef.h>
 #include <string>
 #include <vector>

+#include "paddle/fluid/platform/type_defs.h"
+
 namespace paddle {
 namespace platform {
@@ -86,28 +94,36 @@ size_t GpuMaxChunkSize();

 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+#ifdef PADDLE_WITH_HIP
+                    enum hipMemcpyKind kind, hipStream_t stream);
+#else
                     enum cudaMemcpyKind kind, cudaStream_t stream);
+#endif

 //! Copy memory from address src to dst synchronously.
 void GpuMemcpySync(void *dst, const void *src, size_t count,
+#ifdef PADDLE_WITH_HIP
+                   enum hipMemcpyKind kind);
+#else
                    enum cudaMemcpyKind kind);
+#endif

 //! Copy memory from one device to another device asynchronously.
 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream);
+                        int src_device, size_t count, gpuStream_t stream);

 //! Copy memory from one device to another device synchronously.
 void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count);

 //! Set memory dst with value count size asynchronously
-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
+void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);

 //! Blocks until stream has completed all operations.
-void GpuStreamSync(cudaStream_t stream);
+void GpuStreamSync(gpuStream_t stream);

 //! CudaMalloc with recorded info
-cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
+gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);

 //! CudaFree with recorded info
 void RecordedCudaFree(void *p, size_t size, int dev_id);
......
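The gpuStream_t and gpuError_t spellings introduced above come from the newly included type_defs.h, which this diff does not show. Presumably it provides platform-neutral aliases along these lines (a sketch, not the verbatim header):

// Hypothetical sketch of paddle/fluid/platform/type_defs.h: one alias set per
// backend so shared declarations compile with a single spelling.
#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
#define gpuSuccess hipSuccess
#else
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
#define gpuSuccess cudaSuccess
#endif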
@@ -16,9 +16,13 @@
 #pragma once

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#else
+#include <hip/hip_runtime.h>
+#endif
 #include <stddef.h>
 #include <algorithm>
 #include <string>
......
@@ -14,7 +14,7 @@
 #pragma once

-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <stdio.h>
 #include <memory>
 #include <string>
@@ -25,7 +25,12 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/collective_helper.h"
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
@@ -81,7 +86,7 @@ struct NCCLContext {
   explicit NCCLContext(int dev_id)
       : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}

-  cudaStream_t stream() const { return ctx_->stream(); }
+  gpuStream_t stream() const { return ctx_->stream(); }
   ncclComm_t comm() const { return comm_; }

   int device_id() const {
......
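RCCL deliberately mirrors the NCCL API, which is why NCCLContext can keep using ncclComm_t and ncclUniqueId unchanged; only the dynload header differs between the two backends. An illustrative call (buffer pointers and the element count are placeholders):

// The same collective compiles against NCCL or RCCL; Paddle's enforce.h
// understands ncclResult_t for both backends.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
    sendbuff, recvbuff, count, ncclFloat, ncclSum, ctx.comm(), ctx.stream()));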
@@ -154,7 +154,7 @@ struct PlaceVisitorWrapper
   }

   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     return visitor_(cuda);
 #else
     PADDLE_THROW(platform::errors::Unavailable(
@@ -165,7 +165,7 @@ struct PlaceVisitorWrapper
   typename Visitor::result_type operator()(
       const CUDAPinnedPlace &cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     return visitor_(cuda_pinned);
 #else
     PADDLE_THROW(platform::errors::Unavailable(
......
@@ -206,7 +206,7 @@ void EnableProfiler(ProfilerState state) {
   g_state = state;
   should_send_profile_state = true;
   GetDeviceTracer()->Enable();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
       g_state == ProfilerState::kCPU) {
     // Generate some dummy events first to reduce the startup overhead.
......
@@ -12,7 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
@@ -31,6 +38,21 @@ static void ForEachDevice(std::function<void(int)> func) {
 }
 void DummyKernelAndEvent() {
+#ifdef PADDLE_WITH_HIP
+  for (int i = 0; i < 5; i++) {
+    ForEachDevice([](int d) {
+      platform::SetDeviceId(d);
+      hipStream_t stream;
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
+      Mark("_cuda_startup_");
+      int *ptr;
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int)));
+      hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr));
+    });
+  }
+#else
   for (int i = 0; i < 5; i++) {
     ForEachDevice([](int d) {
       platform::SetDeviceId(d);
@@ -44,6 +66,7 @@ void DummyKernelAndEvent() {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
     });
   }
+#endif
 }
 }  // namespace platform
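`DummyKernel` itself is defined earlier in this file and does not appear in the diff; a plausible minimal form, shown only to make the HIP launch mapping concrete:

// Assumed shape of the kernel named above (hypothetical; the real definition
// lives outside the diff).
__global__ void DummyKernel(int *ptr) { *ptr = 0; }

// CUDA triple-chevron launch:  DummyKernel<<<1, 1, 0, stream>>>(ptr);
// HIP portable macro, as used in the hunk:
//   hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);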
......
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 namespace paddle {
@@ -220,7 +220,7 @@ std::string OpName(const framework::VariableNameMap& name_map,
                    const std::string& type_name);
 void SetTracerOption(TracerOption option);
 platform::TracerOption GetTracerOption();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void DummyKernelAndEvent();
 #endif
......
@@ -31,6 +31,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 namespace paddle {
 namespace platform {
@@ -122,6 +125,13 @@ void SynchronizeAllDevice() {
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
   }
 #endif
+#ifdef PADDLE_WITH_HIP
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
+  }
+#endif
 }
 // Print results
@@ -300,7 +310,7 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
   if (rit != pushed_events->rend()) {
     double event_time = 0;
     double gpu_time = 0.0f;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     gpu_time = rit->CudaElapsedMs(analyze_event);
 #endif
     double cpu_time = rit->CpuElapsedMs(analyze_event);
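The HIP branch of SynchronizeAllDevice repeats the CUDA device loop with `hipDeviceSynchronize`. A sketch of how the two branches could share one body, assuming the `gpuError_t` alias noted earlier (not part of this patch):

// Sketch only, under the assumed alias.
static inline gpuError_t gpuDeviceSynchronize() {
#ifdef PADDLE_WITH_HIP
  return hipDeviceSynchronize();
#else
  return cudaDeviceSynchronize();
#endif
}
// SynchronizeAllDevice() could then keep a single loop:
//   for (int i = 0; i < GetCUDADeviceCount(); i++) {
//     SetDeviceId(i);
//     PADDLE_ENFORCE_CUDA_SUCCESS(gpuDeviceSynchronize());
//   }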
......
@@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) {
     if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
     if (events[i][j].name() == "push") {
       EXPECT_EQ(events[i][j + 1].name(), "pop");
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
 #else
       EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
@@ -146,3 +146,13 @@ TEST(TMP, stream_wait) {
   cudaStreamSynchronize(stream);
 }
 #endif
+#ifdef PADDLE_WITH_HIP
+TEST(TMP, stream_wait) {
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+  hipStreamSynchronize(stream);
+  hipStreamSynchronize(stream);
+  hipStreamSynchronize(stream);
+}
+#endif
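Like the CUDA test it mirrors, the HIP variant never destroys the stream it creates; a tidier version would presumably end with:

hipStreamDestroy(stream);  // hypothetical cleanup; not part of the patch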
@@ -18,7 +18,10 @@
 namespace paddle {
 namespace platform {
-#if CUDA_VERSION >= 10000
+#ifdef PADDLE_WITH_HIP
+static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status,
+                               void *user_data)
+#elif CUDA_VERSION >= 10000
 static void CUDART_CB StreamCallbackFunc(void *user_data)
 #else
 static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
@@ -30,7 +33,7 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
   (*func)();
 }
-StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
+StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream)
     : stream_(stream), thread_pool_(1) {}
 void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
@@ -42,7 +45,10 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
       (*callback_func)();
     });
   });
-#if CUDA_VERSION >= 10000
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
+#elif CUDA_VERSION >= 10000
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
 #else
@@ -52,7 +58,11 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
 }
 void StreamCallbackManager::Wait() const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
+#endif
   {
     std::lock_guard<std::mutex> lock(mtx_);
     if (last_future_.valid()) {
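Taken together, the callback path now registers through `hipStreamAddCallback` on ROCm (whose `(stream, status, user_data)` callback signature matches the pre-10.0 CUDA form) and through `cudaLaunchHostFunc` on CUDA >= 10.0. A usage sketch under the aliases assumed earlier; stream creation here is illustrative, not part of the class:

void ExampleStreamCallbackUsage() {
  gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
#endif
  platform::StreamCallbackManager manager(stream);
  // The callback body runs on the manager's worker thread once all work
  // previously enqueued on `stream` has completed.
  manager.AddCallback([] { /* post-stream work */ });
  manager.Wait();  // synchronize the stream, then join the pending future
}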
......
@@ -15,8 +15,16 @@
 #pragma once
 #include <ThreadPool.h>
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
@@ -31,7 +39,7 @@ namespace platform {
 // Make StreamCallbackManager thread-safe
 class StreamCallbackManager {
  public:
-  explicit StreamCallbackManager(const cudaStream_t stream);
+  explicit StreamCallbackManager(const gpuStream_t stream);
   ~StreamCallbackManager() = default;
@@ -40,7 +48,7 @@ class StreamCallbackManager {
   void Wait() const;
  private:
-  const cudaStream_t stream_;
+  const gpuStream_t stream_;
   mutable ::ThreadPool thread_pool_;
   mutable std::mutex mtx_;
   mutable std::future<void> last_future_;
......
@@ -40,24 +40,36 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
     RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total,
                            DEVICE_ID);
     ASSERT_EQ(total, limit);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
   }
   {
     CUDADeviceGuard guard(DEVICE_ID);
     GpuMemoryUsage(&avail, &total);
     ASSERT_EQ(total, limit);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
   }
-  cudaError_t err = cudaSuccess;
+  gpuError_t err = gpuSuccess;
   void *p1 = nullptr;
   size_t size1 = limit / 4 * 3;
   {
     err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID);
-    ASSERT_EQ(err, cudaSuccess);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    ASSERT_EQ(err, gpuSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
     ASSERT_NE(p1, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -67,8 +79,13 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
   size_t size2 = limit / 2;
   {
     err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(err, hipErrorOutOfMemory);
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
     ASSERT_EQ(err, cudaErrorMemoryAllocation);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
     ASSERT_EQ(p2, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -81,8 +98,12 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
   {
     err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
-    ASSERT_EQ(err, cudaSuccess);
+    ASSERT_EQ(err, gpuSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), hipSuccess);
+#else
    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#endif
     ASSERT_NE(p2, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2);
   }
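The only semantic divergence in this test is the out-of-memory code: HIP reports `hipErrorOutOfMemory` where the CUDA runtime reports `cudaErrorMemoryAllocation`. A hypothetical predicate that would fold the split assertions (not part of the patch, and again assuming the `gpuError_t` alias):

static inline bool IsGpuOutOfMemory(gpuError_t err) {
#ifdef PADDLE_WITH_HIP
  return err == hipErrorOutOfMemory;
#else
  return err == cudaErrorMemoryAllocation;
#endif
}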
......
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
@@ -76,7 +76,7 @@ struct Transform<platform::CPUDeviceContext> {
   }
 };
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <>
 struct Transform<platform::CUDADeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
@@ -86,10 +86,17 @@ struct Transform<platform::CUDADeviceContext> {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       platform::errors::PreconditionNotMet(
                           "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#else
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first),
                       details::CastToCUDATransformIterator(last),
                       details::CastToCUDATransformIterator(result), op);
+#endif
   }
   template <typename InputIter1, typename InputIter2, typename OutputIter,
@@ -101,11 +108,19 @@ struct Transform<platform::CUDADeviceContext> {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       platform::errors::PreconditionNotMet(
                           "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#else
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first1),
                       details::CastToCUDATransformIterator(last1),
                      details::CastToCUDATransformIterator(first2),
                       details::CastToCUDATransformIterator(result), op);
+#endif
   }
 };
 #endif
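Both thrust branches are identical except for the execution-policy namespace: rocThrust exposes `thrust::hip::par` where NVIDIA's Thrust exposes `thrust::cuda::par`. A sketch of a policy helper that would deduplicate them, assuming C++14 return-type deduction (hypothetical, not in this patch):

// Hypothetical helper; only one branch is ever compiled, so the deduced
// return types never conflict.
inline auto GpuPolicyOn(const platform::CUDADeviceContext &context) {
#ifdef __HIPCC__
  return thrust::hip::par.on(context.stream());
#else
  return thrust::cuda::par.on(context.stream());
#endif
}
// Each Transform overload could then call:
//   thrust::transform(GpuPolicyOn(context), ...);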
......
@@ -32,7 +32,7 @@ limitations under the License. */
 // BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
 // function symbols. For details,
 // https://github.com/PaddlePaddle/Paddle/issues/3386
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #define BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #endif
......