Unverified commit 34f1628c, authored by Qi Li, committed by GitHub

[ROCM] update fluid platform for rocm39 (part2), test=develop (#30774)

Parent 5ded39f2
@@ -90,7 +90,7 @@ struct PADDLE_ALIGN(2) float16 {
   // Constructors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit float16(const half& h) {
-#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
 #else
@@ -366,10 +366,11 @@ struct PADDLE_ALIGN(2) float16 {
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-// xuan[TODO] change for rocm
-#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+// ROCM has built-in arithmetic operators, since __HIP_NO_HALF_OPERATORS__
+// is not defined
+#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000
 DEVICE inline half operator+(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
 #else
   float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
@@ -378,7 +379,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
 }

 DEVICE inline half operator-(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
 #else
   float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
@@ -387,7 +388,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
 }

 DEVICE inline half operator*(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
 #else
   float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
@@ -396,7 +397,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
 }

 DEVICE inline half operator/(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
@@ -407,7 +408,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
 }

 DEVICE inline half operator-(const half& a) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hneg(a);
 #else
   float res = -static_cast<float>(float16(a));
@@ -438,7 +439,7 @@ DEVICE inline half& operator/=(half& a, const half& b) {  // NOLINT
 #endif

 DEVICE inline bool operator==(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
 #else
   return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
@@ -446,7 +447,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
 }

 DEVICE inline bool operator!=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
 #else
   return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
@@ -454,7 +455,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
 }

 DEVICE inline bool operator<(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
 #else
   return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
@@ -462,7 +463,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
 }

 DEVICE inline bool operator<=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
 #else
   return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
@@ -470,7 +471,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
 }

 DEVICE inline bool operator>(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
 #else
   return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
@@ -478,7 +479,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
 }

 DEVICE inline bool operator>=(const half& a, const half& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
 #else
   return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
@@ -489,9 +490,8 @@ DEVICE inline bool operator>=(const half& a, const half& b) {

 // Arithmetic operators for float16 on GPU
 #if defined(PADDLE_CUDA_FP16)
-// HIPCC has compile error if call __device__ function __hadd in __host__
-// __device__ function
+// HIPCC raises a compile error if a __device__ function such as __hadd or
+// __hsub is called inside a __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator+(const float16& a, const float16& b) {
   return float16(__hadd(half(a), half(b)));
@@ -509,8 +509,6 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 }
 #endif

-// HIPCC has compile error if call __device__ function __hsub in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a, const float16& b) {
   return float16(__hsub(half(a), half(b)));
@@ -528,8 +526,6 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 }
 #endif

-// HIPCC has compile error if call __device__ function __hmul in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator*(const float16& a, const float16& b) {
   return float16(__hmul(half(a), half(b)));
@@ -547,8 +543,16 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 }
 #endif

+#if defined(__HIPCC__)
+DEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(__hdiv(half(a), half(b)));
+}
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) / static_cast<float>(b));
+}
+#else
 HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   // TODO(kexinzhao): check which cuda version starts to support __hdiv
   float num = __half2float(half(a));
   float denom = __half2float(half(b));
@@ -557,9 +561,8 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) / static_cast<float>(b));
 #endif
 }
+#endif

-// HIPCC has compile error if call __device__ function __hneg in __host__
-// __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a) {
   return float16(__hneg(half(a)));
@@ -601,8 +604,8 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
   return a;
 }
-// HIPCC has compile error if call __device__ function __heq in __host__
-// __device__ function
+// HIPCC raises a compile error if a __device__ function such as __heq or
+// __hne is called inside a __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline bool operator==(const float16& a, const float16& b) {
   return __heq(half(a), half(b));
@@ -610,7 +613,7 @@ DEVICE inline bool operator==(const float16& a, const float16& b) {
 HOST inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
 }
-#else  // CUDA
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(half(a), half(b));
@@ -618,47 +621,92 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
 #endif
 }
-#endif
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return __hne(half(a), half(b));
+}
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return static_cast<float>(a) != static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(half(a), half(b));
 #else
   return static_cast<float>(a) != static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator<(const float16& a, const float16& b) {
+  return __hlt(half(a), half(b));
+}
+HOST inline bool operator<(const float16& a, const float16& b) {
+  return static_cast<float>(a) < static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(half(a), half(b));
 #else
   return static_cast<float>(a) < static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return __hle(half(a), half(b));
+}
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(half(a), half(b));
 #else
   return static_cast<float>(a) <= static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator>(const float16& a, const float16& b) {
+  return __hgt(half(a), half(b));
+}
+HOST inline bool operator>(const float16& a, const float16& b) {
+  return static_cast<float>(a) > static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(half(a), half(b));
 #else
   return static_cast<float>(a) > static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

+#if defined(__HIPCC__)
+DEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return __hge(half(a), half(b));
+}
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
+}
+#else  // __HIPCC__
 HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
-#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(half(a), half(b));
 #else
   return static_cast<float>(a) >= static_cast<float>(b);
 #endif
 }
+#endif  // __HIPCC__

 // Arithmetic operators for float16 on ARMv8.2-A CPU
 #elif defined(PADDLE_WITH_NATIVE_FP16)
......
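The HIP-side pattern above is worth spelling out once: HIPCC cannot call __device__-only half intrinsics from a __host__ __device__ function, so each HOSTDEVICE operator is split into a DEVICE overload that uses the intrinsic and a HOST overload that falls back to float math. A minimal self-contained sketch of the same idea, with an illustrative function name AddFp16 that is not part of the patch:

#if defined(__HIPCC__)
// DEVICE expands to __device__: this overload may use half intrinsics.
DEVICE inline float16 AddFp16(const float16& a, const float16& b) {
  return float16(__hadd(half(a), half(b)));
}
// HOST expands to __host__: plain float math, no device intrinsics.
HOST inline float16 AddFp16(const float16& a, const float16& b) {
  return float16(static_cast<float>(a) + static_cast<float>(b));
}
#else
// NVCC accepts a single __host__ __device__ definition that branches on
// __CUDA_ARCH__, which is only defined during the device compilation pass.
HOSTDEVICE inline float16 AddFp16(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return float16(__hadd(half(a), half(b)));
#else
  return float16(static_cast<float>(a) + static_cast<float>(b));
#endif
}
#endif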
@@ -22,30 +22,109 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"

 #define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+  __global__ void op_type(const half *in1, const half *in2, half *out) { \
     out[0] = in1[0] sign in2[0]; \
   }

 #define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+  __global__ void op_type(half *in1, const half *in2) { in1[0] sign in2[0]; }

 #define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+  __global__ void op_type(const half *in1, const half *in2, bool *out) { \
     out[0] = in1[0] sign in2[0]; \
   }
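For orientation, the invocation ARITHMETIC_KERNEL(Add, +) used further down in this file expands to a one-element, single-thread kernel:

// Expansion of ARITHMETIC_KERNEL(Add, +): applies the operator to one value.
__global__ void Add(const half *in1, const half *in2, half *out) {
  out[0] = in1[0] + in2[0];
}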
+#ifdef PADDLE_WITH_HIP
+#define ARITHMETIC_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }

+#define COMPOUND_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2); \
+    hipMemcpy(in1, d_in1, size, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out); \
+    free(in1); \
+    free(in2); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+  }

+#define COMPARISON_KERNEL_LAUNCH(op_type) \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!"; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    bool *out, *d_out; \
+    int size = sizeof(half); \
+    hipMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    hipMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
+    hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice); \
+    hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice); \
+    hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
+    hipMemcpy(out, d_out, 1, hipMemcpyDeviceToHost); \
+    EXPECT_EQ(out[0], v_out); \
+    free(in1); \
+    free(in2); \
+    free(out); \
+    hipFree(d_in1); \
+    hipFree(d_in2); \
+    hipFree(d_out); \
+  }
+#else
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
     half *in1, *in2, *out; \
     half *d_in1, *d_in2, *d_out; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<half *>(malloc(size)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -67,10 +146,10 @@ limitations under the License. */
     half *in1, *in2; \
     half *d_in1, *d_in2; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -91,12 +170,12 @@ limitations under the License. */
     half *d_in1, *d_in2; \
     bool *out, *d_out; \
     int size = sizeof(half); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in1), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_in2), size); \
-    cudaMalloc(reinterpret_cast<void**>(&d_out), 1); \
-    in1 = reinterpret_cast<half*>(malloc(size)); \
-    in2 = reinterpret_cast<half*>(malloc(size)); \
-    out = reinterpret_cast<bool*>(malloc(1)); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \
+    cudaMalloc(reinterpret_cast<void **>(&d_out), 1); \
+    in1 = reinterpret_cast<half *>(malloc(size)); \
+    in2 = reinterpret_cast<half *>(malloc(size)); \
+    out = reinterpret_cast<bool *>(malloc(1)); \
     in1[0] = half(float16(v_in1)); \
     in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
@@ -111,12 +190,14 @@ limitations under the License. */
     cudaFree(d_in2); \
     cudaFree(d_out); \
   }
+#endif

 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
 namespace platform {

-#if CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP) || \
+    (defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -128,21 +209,37 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)

 // Negative sign kernel
-__global__ void Neg(half* in) { in[0] = -in[0]; }
+__global__ void Neg(half *in) { in[0] = -in[0]; }

 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
   half *in, *d_in;
   int size = sizeof(half);
-  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
-  in = reinterpret_cast<half*>(malloc(size));
+#ifdef PADDLE_WITH_HIP
+  hipMalloc(reinterpret_cast<void **>(&d_in), size);
+#else
+  cudaMalloc(reinterpret_cast<void **>(&d_in), size);
+#endif
+  in = reinterpret_cast<half *>(malloc(size));
   in[0] = half(float16(v_in));
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
+#else
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+#endif
   Neg<<<1, 1>>>(d_in);
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(in, d_in, size, hipMemcpyDeviceToHost);
+#else
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+#endif
   EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
   free(in);
+#ifdef PADDLE_WITH_HIP
+  hipFree(d_in);
+#else
   cudaFree(d_in);
+#endif
 }

 COMPOUND_KERNEL(AddAssign, +=)
@@ -221,7 +318,7 @@ TEST(float16, lod_tensor_on_gpu) {
   framework::LoDTensor gpu_tensor;
   framework::LoDTensor dst_tensor;

-  float16* src_ptr = src_tensor.mutable_data<float16>(
+  float16 *src_ptr = src_tensor.mutable_data<float16>(
       framework::make_ddim({2, 2}), CPUPlace());

   float16 arr[4] = {float16(1.0f), float16(0.5f), float16(0.33333f),
@@ -238,7 +335,7 @@ TEST(float16, lod_tensor_on_gpu) {
   // Sync before comparing LoDTensors
   gpu_ctx.Wait();
-  const float16* dst_ptr = dst_tensor.data<float16>();
+  const float16 *dst_ptr = dst_tensor.data<float16>();
   ASSERT_NE(src_ptr, dst_ptr);
   for (size_t i = 0; i < 4; ++i) {
     EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x);
@@ -247,7 +344,7 @@ TEST(float16, lod_tensor_on_gpu) {

 template <typename T>
 struct Functor {
-  bool operator()(const T& val) {
+  bool operator()(const T &val) {
     return std::type_index(typeid(T)) ==
            std::type_index(typeid(platform::float16));
   }
@@ -304,13 +401,13 @@ TEST(float16, cast) {
   auto b = a;
   {
     // change semantic, keep the same value
-    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    float16 c = reinterpret_cast<float16 &>(reinterpret_cast<unsigned &>(b));
     EXPECT_EQ(b, c);
   }

   {
     // use uint32 low 16 bit store float16
-    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    uint32_t c = reinterpret_cast<uint32_t &>(b);
     float16 d;
     d.x = c;
     EXPECT_EQ(b, d);
......
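The HIP launch macros above replace CUDA's triple-chevron launch with hipLaunchKernelGGL, which takes the kernel and launch configuration as ordinary arguments. Roughly, for the Add kernel with the same grid, block, shared-memory size, and default stream as used in those macros:

// CUDA form: kernel<<<grid, block, shared_mem_bytes, stream>>>(args...)
Add<<<dim3(1), dim3(1), 0, 0>>>(d_in1, d_in2, d_out);
// HIP equivalent: the configuration becomes leading arguments.
hipLaunchKernelGGL(Add, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out);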
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/gen_comm_id_helper.h"

 #include <arpa/inet.h>
@@ -336,7 +337,7 @@ void RecvBroadCastCommID(int server_fd, std::string endpoint,
   template void RecvBroadCastCommID<Type>(std::string endpoint, \
                                           std::vector<Type> * nccl_ids);

-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 INSTANT_TEMPLATE(ncclUniqueId)
 #endif
 #ifdef PADDLE_WITH_XPU_BKCL
......

@@ -14,7 +14,8 @@ limitations under the License. */
 #pragma once

-#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL)
 #include <functional>
 #include <string>
 #include <vector>
......
@@ -17,7 +17,11 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/dynload/miopen.h"
+#else
 #include "paddle/fluid/platform/dynload/cudnn.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 #include "paddle/fluid/platform/macros.h"
@@ -40,19 +44,34 @@ namespace platform {
 int CudnnVersion() {
   if (!dynload::HasCUDNN()) return -1;

+#ifdef PADDLE_WITH_HIP
+  size_t version_major, version_minor, version_patch;
+  PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
+      &version_major, &version_minor, &version_patch));
+  return version_major * 100 + version_minor * 10 + version_patch;
+#else
   return dynload::cudnnGetVersion();
+#endif
 }
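Note that the two return encodings differ: on ROCm the MIOpen version is packed as major * 100 + minor * 10 + patch, so MIOpen 2.9.0 yields 290, while on CUDA cudnnGetVersion() returns cuDNN's own encoding (for example, 7605 for cuDNN 7.6.5). Code comparing CudnnVersion() against a threshold has to account for both scales.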
 static int GetCUDADeviceCountImpl() {
   int driverVersion = 0;
+#ifdef PADDLE_WITH_HIP
+  hipError_t status = hipDriverGetVersion(&driverVersion);
+#else
   cudaError_t status = cudaDriverGetVersion(&driverVersion);
+#endif

-  if (!(status == cudaSuccess && driverVersion != 0)) {
+  if (!(status == gpuSuccess && driverVersion != 0)) {
     // No GPU driver
     VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!";
     return 0;
   }

+#ifdef PADDLE_WITH_HIP
+  const auto *cuda_visible_devices = std::getenv("HIP_VISIBLE_DEVICES");
+#else
   const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
   if (cuda_visible_devices != nullptr) {
     std::string cuda_visible_devices_str(cuda_visible_devices);
     if (!cuda_visible_devices_str.empty()) {
@@ -68,12 +87,17 @@ static int GetCUDADeviceCountImpl() {
     if (std::all_of(cuda_visible_devices_str.begin(),
                     cuda_visible_devices_str.end(),
                     [](char ch) { return ch == ' '; })) {
-      VLOG(2) << "CUDA_VISIBLE_DEVICES is set to be empty. No GPU detected.";
+      VLOG(2) << "CUDA_VISIBLE_DEVICES or HIP_VISIBLE_DEVICES is set to be "
+                 "empty. No GPU detected.";
       return 0;
     }
   }
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDeviceCount(&count));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDeviceCount(&count));
+#endif
   return count;
 }
@@ -94,13 +118,24 @@ int GetCUDAComputeCapability(int id) {
                         id, GetCUDADeviceCount()));
   int major, minor;
+#ifdef PADDLE_WITH_HIP
+  auto major_error_code = hipDeviceGetAttribute(
+      &major, hipDeviceAttributeComputeCapabilityMajor, id);
+  auto minor_error_code = hipDeviceGetAttribute(
+      &minor, hipDeviceAttributeComputeCapabilityMinor, id);
+#else
   auto major_error_code =
       cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id);
   auto minor_error_code =
       cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(major_error_code);
   PADDLE_ENFORCE_CUDA_SUCCESS(minor_error_code);
+#ifdef PADDLE_WITH_HIP
+  return major * 100 + minor;
+#else
   return major * 10 + minor;
+#endif
 }
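The same caveat applies here: under CUDA, compute capability 7.0 is returned as 70, which is the value TensorCoreAvailable() below compares against, while under ROCm the HIP major/minor pair is packed as major * 100 + minor, so a device reporting major 9, minor 0 yields 900. TensorCoreAvailable() itself is compiled out on ROCm, consistent with the differing scales.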
 dim3 GetGpuMaxGridDimSize(int id) {
@@ -111,15 +146,30 @@ dim3 GetGpuMaxGridDimSize(int id) {
                         id, GetCUDADeviceCount()));
   dim3 ret;
   int size;
+#ifdef PADDLE_WITH_HIP
+  auto error_code_x =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimX, id);
+#else
   auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_x);
   ret.x = size;

+#ifdef PADDLE_WITH_HIP
+  auto error_code_y =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimY, id);
+#else
   auto error_code_y = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimY, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_y);
   ret.y = size;

+#ifdef PADDLE_WITH_HIP
+  auto error_code_z =
+      hipDeviceGetAttribute(&size, hipDeviceAttributeMaxGridDimZ, id);
+#else
   auto error_code_z = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimZ, id);
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(error_code_z);
   ret.z = size;
   return ret;
@@ -132,7 +182,11 @@ int GetCUDARuntimeVersion(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int runtime_version = 0;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipRuntimeGetVersion(&runtime_version));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaRuntimeGetVersion(&runtime_version));
+#endif
   return runtime_version;
 }
@@ -143,12 +197,16 @@ int GetCUDADriverVersion(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int driver_version = 0;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipDriverGetVersion(&driver_version));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaDriverGetVersion(&driver_version));
+#endif
   return driver_version;
 }

 bool TensorCoreAvailable() {
-#if CUDA_VERSION >= 9000
+#if !defined(PADDLE_WITH_HIP) && CUDA_VERSION >= 9000
   int device = GetCurrentDeviceId();
   int driver_version = GetCUDAComputeCapability(device);
   return driver_version >= 70;
@@ -164,8 +222,13 @@ int GetCUDAMultiProcessors(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id));
+#endif
   return count;
 }
@@ -176,8 +239,13 @@ int GetCUDAMaxThreadsPerMultiProcessor(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceGetAttribute(
+      &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceGetAttribute(
       &count, cudaDevAttrMaxThreadsPerMultiProcessor, id));
+#endif
   return count;
 }
@@ -188,14 +256,23 @@ int GetCUDAMaxThreadsPerBlock(int id) {
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
   int count;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id));
+#endif
   return count;
 }

 int GetCurrentDeviceId() {
   int device_id;
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipGetDevice(&device_id));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetDevice(&device_id));
+#endif
   return device_id;
 }
@@ -224,7 +301,11 @@ void SetDeviceId(int id) {
                         "Device id must be less than GPU count, "
                         "but received id is: %d. GPU count is: %d.",
                         id, GetCUDADeviceCount()));
+#ifdef PADDLE_WITH_HIP
+  PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id));
+#else
   PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id));
+#endif
 }

 void GpuMemoryUsage(size_t *available, size_t *total) {
@@ -289,46 +370,91 @@ size_t GpuMaxChunkSize() {
   return max_chunk_size;
 }

+#ifdef PADDLE_WITH_HIP
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum hipMemcpyKind kind, hipStream_t stream) {
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpyAsync(dst, src, count, kind, stream));
+}
+#else
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream) {
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(dst, src, count, kind, stream));
 }
+#endif

+#ifdef PADDLE_WITH_HIP
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum hipMemcpyKind kind) {
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemcpy(dst, src, count, kind));
+}
+#else
 void GpuMemcpySync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind) {
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(dst, src, count, kind));
 }
+#endif

 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream) {
+                        int src_device, size_t count, gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream));
+#endif
 }

 void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipMemcpyPeer(dst, dst_device, src, src_device, count));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaMemcpyPeer(dst, dst_device, src, src_device, count));
+#endif
 }

-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
+void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipMemsetAsync(dst, value, count, stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(dst, value, count, stream));
+#endif
 }

-void GpuStreamSync(cudaStream_t stream) {
+void GpuStreamSync(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
 }

-static void RaiseNonOutOfMemoryError(cudaError_t *status) {
+static void RaiseNonOutOfMemoryError(gpuError_t *status) {
+#ifdef PADDLE_WITH_HIP
+  if (*status == hipErrorOutOfMemory) {
+    *status = hipSuccess;
+  }
+#else
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);

+#ifdef PADDLE_WITH_HIP
+  *status = hipGetLastError();
+  if (*status == hipErrorOutOfMemory) {
+    *status = hipSuccess;
+  }
+#else
   *status = cudaGetLastError();
   if (*status == cudaErrorMemoryAllocation) {
     *status = cudaSuccess;
   }
+#endif
   PADDLE_ENFORCE_CUDA_SUCCESS(*status);
 }
@@ -370,26 +496,38 @@ class RecordedCudaMallocHelper {
    * or cudaSuccess would be returned, and the cudaGetLastError() flag
    * would be clear.
    */
-  cudaError_t Malloc(void **ptr, size_t size) {
+  gpuError_t Malloc(void **ptr, size_t size) {
     LockGuardPtr<std::mutex> lock(mtx_);
     if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) {
+#ifdef PADDLE_WITH_HIP
+      return hipErrorOutOfMemory;
+#else
       return cudaErrorMemoryAllocation;
+#endif
     }

     CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+    auto result = hipMalloc(ptr, size);
+#else
     auto result = cudaMalloc(ptr, size);
+#endif
-    if (result == cudaSuccess) {
+    if (result == gpuSuccess) {
       if (NeedRecord()) {
         cur_size_ += size;
       }
       STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
-      return cudaSuccess;
+      return gpuSuccess;
     } else {
       RaiseNonOutOfMemoryError(&result);
       // Non out of memory error would be raised inside
       // RaiseNonOutOfMemoryError. Therefore, we can
       // return cudaErrorMemoryAllocation directly here.
+#ifdef PADDLE_WITH_HIP
+      return hipErrorOutOfMemory;
+#else
       return cudaErrorMemoryAllocation;
+#endif
     }
   }
@@ -404,8 +542,13 @@ class RecordedCudaMallocHelper {
     // process is terminating, in which case we don't care if
     // cudaFree succeeds.
     CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+    auto err = hipFree(ptr);
+    if (err != hipErrorDeinitialized) {
+#else
     auto err = cudaFree(ptr);
     if (err != cudaErrorCudartUnloading) {
+#endif
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       if (NeedRecord()) {
         std::lock_guard<std::mutex> guard(*mtx_);
@@ -413,7 +556,11 @@ class RecordedCudaMallocHelper {
       }
       STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
     } else {
+#ifdef PADDLE_WITH_HIP
+      hipGetLastError();  // clear the error flag when hipErrorDeinitialized
+#else
       cudaGetLastError();  // clear the error flag when cudaErrorCudartUnloading
+#endif
     }
   }
@@ -421,8 +568,12 @@ class RecordedCudaMallocHelper {
                   size_t *actual_total) {
     {
       CUDADeviceGuard guard(dev_id_);
+#ifdef PADDLE_WITH_HIP
+      auto result = hipMemGetInfo(actual_avail, actual_total);
+#else
       auto result = cudaMemGetInfo(actual_avail, actual_total);
+#endif
-      if (result != cudaSuccess) {
+      if (result != gpuSuccess) {
         *actual_avail = 0;
       }
       RaiseNonOutOfMemoryError(&result);
@@ -458,13 +609,13 @@ class RecordedCudaMallocHelper {
   static std::once_flag once_flag_;
   static std::vector<std::unique_ptr<RecordedCudaMallocHelper>> instances_;
-};
+};  // NOLINT

 std::once_flag RecordedCudaMallocHelper::once_flag_;
 std::vector<std::unique_ptr<RecordedCudaMallocHelper>>
     RecordedCudaMallocHelper::instances_;

-cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
+gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id) {
   return RecordedCudaMallocHelper::Instance(dev_id)->Malloc(ptr, size);
 }
......
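A hedged usage sketch of the recorded-allocation API declared below in gpu_info.h; the buffer size and device id here are illustrative:

// Allocate and free 256 bytes on device 0 through the recording wrapper so
// the per-device "STAT_gpu*_mem_size" counters stay balanced. gpuError_t and
// gpuSuccess are the platform-neutral aliases used throughout this patch.
void *ptr = nullptr;
gpuError_t err = paddle::platform::RecordedCudaMalloc(&ptr, 256, /*dev_id=*/0);
if (err == gpuSuccess) {
  // ... use ptr on device 0 ...
  paddle::platform::RecordedCudaFree(ptr, 256, /*dev_id=*/0);
}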
@@ -15,11 +15,19 @@ limitations under the License. */
 #pragma once

 #ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#endif
+
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// Note: this header simplifies the HIP and CUDA type strings
 #include <stddef.h>
 #include <string>
 #include <vector>

+#include "paddle/fluid/platform/type_defs.h"
+
 namespace paddle {
 namespace platform {
@@ -86,28 +94,36 @@ size_t GpuMaxChunkSize();

 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+#ifdef PADDLE_WITH_HIP
+                    enum hipMemcpyKind kind, hipStream_t stream);
+#else
                     enum cudaMemcpyKind kind, cudaStream_t stream);
+#endif

 //! Copy memory from address src to dst synchronously.
 void GpuMemcpySync(void *dst, const void *src, size_t count,
+#ifdef PADDLE_WITH_HIP
+                   enum hipMemcpyKind kind);
+#else
                    enum cudaMemcpyKind kind);
+#endif

 //! Copy memory from one device to another device asynchronously.
 void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
-                        int src_device, size_t count, cudaStream_t stream);
+                        int src_device, size_t count, gpuStream_t stream);

 //! Copy memory from one device to another device synchronously.
 void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
                        int src_device, size_t count);

 //! Set memory dst with value count size asynchronously
-void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
+void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream);

 //! Blocks until stream has completed all operations.
-void GpuStreamSync(cudaStream_t stream);
+void GpuStreamSync(gpuStream_t stream);

 //! CudaMalloc with recorded info
-cudaError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);
+gpuError_t RecordedCudaMalloc(void **ptr, size_t size, int dev_id);

 //! CudaFree with recorded info
 void RecordedCudaFree(void *p, size_t size, int dev_id);
......
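The gpuStream_t and gpuError_t spellings introduced above come from the newly included type_defs.h, which this diff does not show. Presumably it provides platform-neutral aliases along these lines (a sketch, not the verbatim header):

// Hypothetical sketch of paddle/fluid/platform/type_defs.h: one alias set per
// backend so shared declarations compile with a single spelling.
#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
#define gpuSuccess hipSuccess
#else
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
#define gpuSuccess cudaSuccess
#endif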
@@ -16,9 +16,13 @@
 #pragma once

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#ifdef PADDLE_WITH_CUDA
 #include <cuda_runtime.h>
+#else
+#include <hip/hip_runtime.h>
+#endif
 #include <stddef.h>
 #include <algorithm>
 #include <string>
......
@@ -14,7 +14,7 @@
 #pragma once

-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include <stdio.h>
 #include <memory>
 #include <string>
@@ -25,7 +25,12 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/collective_helper.h"
+#ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#ifdef PADDLE_WITH_RCCL
+#include "paddle/fluid/platform/dynload/rccl.h"
+#endif
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
@@ -81,7 +86,7 @@ struct NCCLContext {
   explicit NCCLContext(int dev_id)
       : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}

-  cudaStream_t stream() const { return ctx_->stream(); }
+  gpuStream_t stream() const { return ctx_->stream(); }
   ncclComm_t comm() const { return comm_; }

   int device_id() const {
......
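RCCL deliberately mirrors the NCCL API, which is why NCCLContext can keep using ncclComm_t and ncclUniqueId unchanged; only the dynload header differs between the two backends. An illustrative call (buffer pointers and the element count are placeholders):

// The same collective compiles against NCCL or RCCL; Paddle's enforce.h
// understands ncclResult_t for both backends.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
    sendbuff, recvbuff, count, ncclFloat, ncclSum, ctx.comm(), ctx.stream()));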
@@ -154,7 +154,7 @@ struct PlaceVisitorWrapper
   }

   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     return visitor_(cuda);
 #else
     PADDLE_THROW(platform::errors::Unavailable(
@@ -165,7 +165,7 @@ struct PlaceVisitorWrapper
   typename Visitor::result_type operator()(
       const CUDAPinnedPlace &cuda_pinned) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     return visitor_(cuda_pinned);
 #else
     PADDLE_THROW(platform::errors::Unavailable(
......
@@ -206,7 +206,7 @@ void EnableProfiler(ProfilerState state) {
   g_state = state;
   should_send_profile_state = true;
   GetDeviceTracer()->Enable();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
       g_state == ProfilerState::kCPU) {
     // Generate some dummy events first to reduce the startup overhead.
......
@@ -12,7 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include "paddle/fluid/platform/profiler.h"
 namespace paddle {
@@ -31,6 +38,21 @@ static void ForEachDevice(std::function<void(int)> func) {
 }
 void DummyKernelAndEvent() {
+#ifdef PADDLE_WITH_HIP
+  for (int i = 0; i < 5; i++) {
+    ForEachDevice([](int d) {
+      platform::SetDeviceId(d);
+      hipStream_t stream;
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
+      Mark("_cuda_startup_");
+      int *ptr;
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipMalloc(&ptr, sizeof(int)));
+      hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipFree(ptr));
+    });
+  }
+#else
   for (int i = 0; i < 5; i++) {
     ForEachDevice([](int d) {
       platform::SetDeviceId(d);
@@ -44,6 +66,7 @@ void DummyKernelAndEvent() {
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
     });
   }
+#endif
 }
 }  // namespace platform
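`DummyKernel` itself is defined earlier in this file and does not appear in the diff; a plausible minimal form, shown only to make the HIP launch mapping concrete:

// Assumed shape of the kernel named above (hypothetical; the real definition
// lives outside the diff).
__global__ void DummyKernel(int *ptr) { *ptr = 0; }

// CUDA triple-chevron launch:  DummyKernel<<<1, 1, 0, stream>>>(ptr);
// HIP portable macro, as used in the hunk:
//   hipLaunchKernelGGL(DummyKernel, dim3(1), dim3(1), 0, stream, ptr);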
......
@@ -28,7 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/event.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
 namespace paddle {
@@ -220,7 +220,7 @@ std::string OpName(const framework::VariableNameMap& name_map,
                    const std::string& type_name);
 void SetTracerOption(TracerOption option);
 platform::TracerOption GetTracerOption();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 void DummyKernelAndEvent();
 #endif
......
@@ -31,6 +31,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 namespace paddle {
 namespace platform {
@@ -122,6 +125,13 @@ void SynchronizeAllDevice() {
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize());
   }
 #endif
+#ifdef PADDLE_WITH_HIP
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize());
+  }
+#endif
 }
 // Print results
@@ -300,7 +310,7 @@ void SetEvent(bool merge_thread, const Event &analyze_event,
   if (rit != pushed_events->rend()) {
     double event_time = 0;
     double gpu_time = 0.0f;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     gpu_time = rit->CudaElapsedMs(analyze_event);
 #endif
     double cpu_time = rit->CpuElapsedMs(analyze_event);
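The HIP branch of SynchronizeAllDevice repeats the CUDA device loop with `hipDeviceSynchronize`. A sketch of how the two branches could share one body, assuming the `gpuError_t` alias noted earlier (not part of this patch):

// Sketch only, under the assumed alias.
static inline gpuError_t gpuDeviceSynchronize() {
#ifdef PADDLE_WITH_HIP
  return hipDeviceSynchronize();
#else
  return cudaDeviceSynchronize();
#endif
}
// SynchronizeAllDevice() could then keep a single loop:
//   for (int i = 0; i < GetCUDADeviceCount(); i++) {
//     SetDeviceId(i);
//     PADDLE_ENFORCE_CUDA_SUCCESS(gpuDeviceSynchronize());
//   }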
......
@@ -122,7 +122,7 @@ TEST(RecordEvent, RecordEvent) {
     if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
     if (events[i][j].name() == "push") {
       EXPECT_EQ(events[i][j + 1].name(), "pop");
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
 #else
       EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
@@ -146,3 +146,13 @@ TEST(TMP, stream_wait) {
   cudaStreamSynchronize(stream);
 }
 #endif
+#ifdef PADDLE_WITH_HIP
+TEST(TMP, stream_wait) {
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+  hipStreamSynchronize(stream);
+  hipStreamSynchronize(stream);
+  hipStreamSynchronize(stream);
+}
+#endif
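Like the CUDA test it mirrors, the HIP variant never destroys the stream it creates; a tidier version would presumably end with:

hipStreamDestroy(stream);  // hypothetical cleanup; not part of the patch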
@@ -18,7 +18,10 @@
 namespace paddle {
 namespace platform {
-#if CUDA_VERSION >= 10000
+#ifdef PADDLE_WITH_HIP
+static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status,
+                               void *user_data)
+#elif CUDA_VERSION >= 10000
 static void CUDART_CB StreamCallbackFunc(void *user_data)
 #else
 static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
@@ -30,7 +33,7 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
   (*func)();
 }
-StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
+StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream)
     : stream_(stream), thread_pool_(1) {}
 void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
@@ -42,7 +45,10 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
       (*callback_func)();
     });
   });
-#if CUDA_VERSION >= 10000
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
+#elif CUDA_VERSION >= 10000
   PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
 #else
@@ -52,7 +58,11 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
 }
 void StreamCallbackManager::Wait() const {
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
+#else
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
+#endif
   {
     std::lock_guard<std::mutex> lock(mtx_);
     if (last_future_.valid()) {
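Taken together, the callback path now registers through `hipStreamAddCallback` on ROCm (whose `(stream, status, user_data)` callback signature matches the pre-10.0 CUDA form) and through `cudaLaunchHostFunc` on CUDA >= 10.0. A usage sketch under the aliases assumed earlier; stream creation here is illustrative, not part of the class:

void ExampleStreamCallbackUsage() {
  gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
#else
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
#endif
  platform::StreamCallbackManager manager(stream);
  // The callback body runs on the manager's worker thread once all work
  // previously enqueued on `stream` has completed.
  manager.AddCallback([] { /* post-stream work */ });
  manager.Wait();  // synchronize the stream, then join the pending future
}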
......
@@ -15,8 +15,16 @@
 #pragma once
 #include <ThreadPool.h>
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
@@ -31,7 +39,7 @@ namespace platform {
 // Make StreamCallbackManager thread-safe
 class StreamCallbackManager {
  public:
-  explicit StreamCallbackManager(const cudaStream_t stream);
+  explicit StreamCallbackManager(const gpuStream_t stream);
   ~StreamCallbackManager() = default;
@@ -40,7 +48,7 @@ class StreamCallbackManager {
   void Wait() const;
  private:
-  const cudaStream_t stream_;
+  const gpuStream_t stream_;
   mutable ::ThreadPool thread_pool_;
   mutable std::mutex mtx_;
   mutable std::future<void> last_future_;
......
@@ -40,24 +40,36 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
     RecordedCudaMemGetInfo(&avail, &total, &actual_avail, &actual_total,
                            DEVICE_ID);
     ASSERT_EQ(total, limit);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
   }
   {
     CUDADeviceGuard guard(DEVICE_ID);
     GpuMemoryUsage(&avail, &total);
     ASSERT_EQ(total, limit);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
   }
-  cudaError_t err = cudaSuccess;
+  gpuError_t err = gpuSuccess;
   void *p1 = nullptr;
   size_t size1 = limit / 4 * 3;
   {
     err = platform::RecordedCudaMalloc(&p1, size1, DEVICE_ID);
-    ASSERT_EQ(err, cudaSuccess);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    ASSERT_EQ(err, gpuSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
     ASSERT_NE(p1, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -67,8 +79,13 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
   size_t size2 = limit / 2;
   {
     err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(err, hipErrorOutOfMemory);
+    ASSERT_EQ(hipGetLastError(), gpuSuccess);
+#else
     ASSERT_EQ(err, cudaErrorMemoryAllocation);
-    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    ASSERT_EQ(cudaGetLastError(), gpuSuccess);
+#endif
     ASSERT_EQ(p2, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size1);
@@ -81,8 +98,12 @@ TEST(test_record_malloc, test_limit_gpu_memory) {
   {
     err = platform::RecordedCudaMalloc(&p2, size2, DEVICE_ID);
-    ASSERT_EQ(err, cudaSuccess);
+    ASSERT_EQ(err, gpuSuccess);
+#ifdef PADDLE_WITH_HIP
+    ASSERT_EQ(hipGetLastError(), hipSuccess);
+#else
    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+#endif
     ASSERT_NE(p2, nullptr);
     ASSERT_EQ(RecordedCudaMallocSize(DEVICE_ID), size2);
   }
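The only semantic divergence in this test is the out-of-memory code: HIP reports `hipErrorOutOfMemory` where the CUDA runtime reports `cudaErrorMemoryAllocation`. A hypothetical predicate that would fold the split assertions (not part of the patch, and again assuming the `gpuError_t` alias):

static inline bool IsGpuOutOfMemory(gpuError_t err) {
#ifdef PADDLE_WITH_HIP
  return err == hipErrorOutOfMemory;
#else
  return err == cudaErrorMemoryAllocation;
#endif
}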
......
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
@@ -76,7 +76,7 @@ struct Transform<platform::CPUDeviceContext> {
   }
 };
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <>
 struct Transform<platform::CUDADeviceContext> {
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
@@ -86,10 +86,17 @@ struct Transform<platform::CUDADeviceContext> {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       platform::errors::PreconditionNotMet(
                           "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
+#else
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first),
                       details::CastToCUDATransformIterator(last),
                       details::CastToCUDATransformIterator(result), op);
+#endif
   }
   template <typename InputIter1, typename InputIter2, typename OutputIter,
@@ -101,11 +108,19 @@ struct Transform<platform::CUDADeviceContext> {
     PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
                       platform::errors::PreconditionNotMet(
                           "The CUDA Transform must be used in GPU place."));
+#ifdef __HIPCC__
+    thrust::transform(thrust::hip::par.on(context.stream()),
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
+#else
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first1),
                       details::CastToCUDATransformIterator(last1),
                      details::CastToCUDATransformIterator(first2),
                       details::CastToCUDATransformIterator(result), op);
+#endif
   }
 };
 #endif
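Both thrust branches are identical except for the execution-policy namespace: rocThrust exposes `thrust::hip::par` where NVIDIA's Thrust exposes `thrust::cuda::par`. A sketch of a policy helper that would deduplicate them, assuming C++14 return-type deduction (hypothetical, not in this patch):

// Hypothetical helper; only one branch is ever compiled, so the deduced
// return types never conflict.
inline auto GpuPolicyOn(const platform::CUDADeviceContext &context) {
#ifdef __HIPCC__
  return thrust::hip::par.on(context.stream());
#else
  return thrust::cuda::par.on(context.stream());
#endif
}
// Each Transform overload could then call:
//   thrust::transform(GpuPolicyOn(context), ...);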
......
@@ -32,7 +32,7 @@ limitations under the License. */
 // BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
 // function symbols. For details,
 // https://github.com/PaddlePaddle/Paddle/issues/3386
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #define BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #endif
......