未验证 提交 f89da4ab 编写于 作者: Q Qi Li 提交者: GitHub

[ROCM] update fluid platform for rocm35 (part1), test=develop (#30639)

* [ROCM] update fluid platform for rocm35 (part1), test=develop

* address review comments, test=develop
上级 fc002405
...@@ -47,7 +47,17 @@ struct PADDLE_ALIGN(2) bfloat16 { ...@@ -47,7 +47,17 @@ struct PADDLE_ALIGN(2) bfloat16 {
~bfloat16() = default; ~bfloat16() = default;
HOSTDEVICE inline explicit bfloat16(float val) { HOSTDEVICE inline explicit bfloat16(float val) {
#ifdef PADDLE_WITH_HIP
uint32_t res = 0;
uint32_t* tempRes;
// We should be using memcpy in order to respect the strict aliasing rule
// but it fails in the HIP environment.
tempRes = reinterpret_cast<uint32_t*>(&val);
res = *tempRes;
x = res >> 16;
#else
std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2); std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
#endif
} }
template <class T> template <class T>
......
...@@ -28,6 +28,11 @@ ...@@ -28,6 +28,11 @@
#include <thrust/complex.h> #include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#include <cstring> #include <cstring>
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
...@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(16) complex128 { ...@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(16) complex128 {
~complex128() = default; ~complex128() = default;
HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {} HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
HOSTDEVICE inline explicit complex128(const thrust::complex<double>& c) { HOSTDEVICE inline explicit complex128(const thrust::complex<double>& c) {
real = c.real(); real = c.real();
...@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(16) complex128 { ...@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(16) complex128 {
return thrust::complex<double>(real, imag); return thrust::complex<double>(real, imag);
} }
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipDoubleComplex() const {
return make_hipDoubleComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuDoubleComplex() const { HOSTDEVICE inline explicit operator cuDoubleComplex() const {
return make_cuDoubleComplex(real, imag); return make_cuDoubleComplex(real, imag);
} }
#endif
#endif #endif
HOSTDEVICE complex128(const float& val) HOSTDEVICE complex128(const float& val)
...@@ -202,7 +213,7 @@ struct PADDLE_ALIGN(16) complex128 { ...@@ -202,7 +213,7 @@ struct PADDLE_ALIGN(16) complex128 {
HOSTDEVICE inline complex128 operator+(const complex128& a, HOSTDEVICE inline complex128 operator+(const complex128& a,
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) + return complex128(thrust::complex<double>(a.real, a.imag) +
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
#else #else
...@@ -212,7 +223,7 @@ HOSTDEVICE inline complex128 operator+(const complex128& a, ...@@ -212,7 +223,7 @@ HOSTDEVICE inline complex128 operator+(const complex128& a,
HOSTDEVICE inline complex128 operator-(const complex128& a, HOSTDEVICE inline complex128 operator-(const complex128& a,
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) - return complex128(thrust::complex<double>(a.real, a.imag) -
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
#else #else
...@@ -222,7 +233,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a, ...@@ -222,7 +233,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a,
HOSTDEVICE inline complex128 operator*(const complex128& a, HOSTDEVICE inline complex128 operator*(const complex128& a,
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) * return complex128(thrust::complex<double>(a.real, a.imag) *
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
#else #else
...@@ -233,7 +244,7 @@ HOSTDEVICE inline complex128 operator*(const complex128& a, ...@@ -233,7 +244,7 @@ HOSTDEVICE inline complex128 operator*(const complex128& a,
HOSTDEVICE inline complex128 operator/(const complex128& a, HOSTDEVICE inline complex128 operator/(const complex128& a,
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::complex<double>(a.real, a.imag) / return complex128(thrust::complex<double>(a.real, a.imag) /
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
#else #else
...@@ -244,7 +255,7 @@ HOSTDEVICE inline complex128 operator/(const complex128& a, ...@@ -244,7 +255,7 @@ HOSTDEVICE inline complex128 operator/(const complex128& a,
} }
HOSTDEVICE inline complex128 operator-(const complex128& a) { HOSTDEVICE inline complex128 operator-(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(-thrust::complex<double>(a.real, a.imag)); return complex128(-thrust::complex<double>(a.real, a.imag));
#else #else
complex128 res; complex128 res;
...@@ -256,7 +267,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a) { ...@@ -256,7 +267,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a) {
HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) += a = complex128(thrust::complex<double>(a.real, a.imag) +=
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
return a; return a;
...@@ -269,7 +280,7 @@ HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT ...@@ -269,7 +280,7 @@ HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) -= a = complex128(thrust::complex<double>(a.real, a.imag) -=
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
return a; return a;
...@@ -282,7 +293,7 @@ HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT ...@@ -282,7 +293,7 @@ HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) *= a = complex128(thrust::complex<double>(a.real, a.imag) *=
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
return a; return a;
...@@ -295,7 +306,7 @@ HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT ...@@ -295,7 +306,7 @@ HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT
HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT
const complex128& b) { const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex128(thrust::complex<double>(a.real, a.imag) /= a = complex128(thrust::complex<double>(a.real, a.imag) /=
thrust::complex<double>(b.real, b.imag)); thrust::complex<double>(b.real, b.imag));
return a; return a;
...@@ -339,6 +350,7 @@ HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) { ...@@ -339,6 +350,7 @@ HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) {
HOSTDEVICE inline bool(isnan)(const complex128& a) { HOSTDEVICE inline bool(isnan)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
// __isnanf not supported on HIP platform
return __isnan(a.real) || __isnan(a.imag); return __isnan(a.real) || __isnan(a.imag);
#else #else
return std::isnan(a.real) || std::isnan(a.imag); return std::isnan(a.real) || std::isnan(a.imag);
...@@ -347,6 +359,7 @@ HOSTDEVICE inline bool(isnan)(const complex128& a) { ...@@ -347,6 +359,7 @@ HOSTDEVICE inline bool(isnan)(const complex128& a) {
HOSTDEVICE inline bool(isinf)(const complex128& a) { HOSTDEVICE inline bool(isinf)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
// __isinf not supported on HIP platform
return __isinf(a.real) || __isinf(a.imag); return __isinf(a.real) || __isinf(a.imag);
#else #else
return std::isinf(a.real) || std::isinf(a.imag); return std::isinf(a.real) || std::isinf(a.imag);
...@@ -358,7 +371,7 @@ HOSTDEVICE inline bool(isfinite)(const complex128& a) { ...@@ -358,7 +371,7 @@ HOSTDEVICE inline bool(isfinite)(const complex128& a) {
} }
HOSTDEVICE inline double(abs)(const complex128& a) { HOSTDEVICE inline double(abs)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return thrust::abs(thrust::complex<double>(a.real, a.imag)); return thrust::abs(thrust::complex<double>(a.real, a.imag));
#else #else
return std::abs(std::complex<double>(a.real, a.imag)); return std::abs(std::complex<double>(a.real, a.imag));
...@@ -366,7 +379,7 @@ HOSTDEVICE inline double(abs)(const complex128& a) { ...@@ -366,7 +379,7 @@ HOSTDEVICE inline double(abs)(const complex128& a) {
} }
HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::pow(thrust::complex<double>(a.real, a.imag), return complex128(thrust::pow(thrust::complex<double>(a.real, a.imag),
thrust::complex<double>(b.real, b.imag))); thrust::complex<double>(b.real, b.imag)));
#else #else
...@@ -375,7 +388,7 @@ HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { ...@@ -375,7 +388,7 @@ HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
} }
HOSTDEVICE inline complex128(sqrt)(const complex128& a) { HOSTDEVICE inline complex128(sqrt)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::sqrt(thrust::complex<double>(a.real, a.imag))); return complex128(thrust::sqrt(thrust::complex<double>(a.real, a.imag)));
#else #else
return std::sqrt(std::complex<double>(a)); return std::sqrt(std::complex<double>(a));
...@@ -383,7 +396,7 @@ HOSTDEVICE inline complex128(sqrt)(const complex128& a) { ...@@ -383,7 +396,7 @@ HOSTDEVICE inline complex128(sqrt)(const complex128& a) {
} }
HOSTDEVICE inline complex128(tanh)(const complex128& a) { HOSTDEVICE inline complex128(tanh)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::tanh(thrust::complex<double>(a.real, a.imag))); return complex128(thrust::tanh(thrust::complex<double>(a.real, a.imag)));
#else #else
return std::tanh(std::complex<double>(a)); return std::tanh(std::complex<double>(a));
...@@ -391,7 +404,7 @@ HOSTDEVICE inline complex128(tanh)(const complex128& a) { ...@@ -391,7 +404,7 @@ HOSTDEVICE inline complex128(tanh)(const complex128& a) {
} }
HOSTDEVICE inline complex128(log)(const complex128& a) { HOSTDEVICE inline complex128(log)(const complex128& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex128(thrust::log(thrust::complex<double>(a.real, a.imag))); return complex128(thrust::log(thrust::complex<double>(a.real, a.imag)));
#else #else
return complex128(std::log(std::complex<double>(a))); return complex128(std::log(std::complex<double>(a)));
......
...@@ -27,6 +27,11 @@ ...@@ -27,6 +27,11 @@
#include <thrust/complex.h> #include <thrust/complex.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hip/hip_complex.h>
#include <thrust/complex.h> // NOLINT
#endif
#include <cstring> #include <cstring>
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex128.h"
...@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(8) complex64 { ...@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(8) complex64 {
~complex64() = default; ~complex64() = default;
HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {} HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {}
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
HOSTDEVICE inline explicit complex64(const thrust::complex<float>& c) { HOSTDEVICE inline explicit complex64(const thrust::complex<float>& c) {
real = c.real(); real = c.real();
...@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(8) complex64 { ...@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(8) complex64 {
return thrust::complex<float>(real, imag); return thrust::complex<float>(real, imag);
} }
#ifdef PADDLE_WITH_HIP
HOSTDEVICE inline explicit operator hipFloatComplex() const {
return make_hipFloatComplex(real, imag);
}
#else
HOSTDEVICE inline explicit operator cuFloatComplex() const { HOSTDEVICE inline explicit operator cuFloatComplex() const {
return make_cuFloatComplex(real, imag); return make_cuFloatComplex(real, imag);
} }
#endif
#endif #endif
HOSTDEVICE complex64(const float& val) : real(val), imag(0) {} HOSTDEVICE complex64(const float& val) : real(val), imag(0) {}
...@@ -207,7 +218,7 @@ struct PADDLE_ALIGN(8) complex64 { ...@@ -207,7 +218,7 @@ struct PADDLE_ALIGN(8) complex64 {
}; };
HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) { HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) + return complex64(thrust::complex<float>(a.real, a.imag) +
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
#else #else
...@@ -216,7 +227,7 @@ HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) { ...@@ -216,7 +227,7 @@ HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
} }
HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) { HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) - return complex64(thrust::complex<float>(a.real, a.imag) -
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
#else #else
...@@ -225,7 +236,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) { ...@@ -225,7 +236,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
} }
HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) { HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) * return complex64(thrust::complex<float>(a.real, a.imag) *
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
#else #else
...@@ -235,7 +246,7 @@ HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) { ...@@ -235,7 +246,7 @@ HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
} }
HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) { HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::complex<float>(a.real, a.imag) / return complex64(thrust::complex<float>(a.real, a.imag) /
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
#else #else
...@@ -246,7 +257,7 @@ HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) { ...@@ -246,7 +257,7 @@ HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
} }
HOSTDEVICE inline complex64 operator-(const complex64& a) { HOSTDEVICE inline complex64 operator-(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(-thrust::complex<float>(a.real, a.imag)); return complex64(-thrust::complex<float>(a.real, a.imag));
#else #else
complex64 res; complex64 res;
...@@ -258,7 +269,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a) { ...@@ -258,7 +269,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a) {
HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT
const complex64& b) { const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) += a = complex64(thrust::complex<float>(a.real, a.imag) +=
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
return a; return a;
...@@ -271,7 +282,7 @@ HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT ...@@ -271,7 +282,7 @@ HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT
const complex64& b) { const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) -= a = complex64(thrust::complex<float>(a.real, a.imag) -=
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
return a; return a;
...@@ -284,7 +295,7 @@ HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT ...@@ -284,7 +295,7 @@ HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT
const complex64& b) { const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) *= a = complex64(thrust::complex<float>(a.real, a.imag) *=
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
return a; return a;
...@@ -297,7 +308,7 @@ HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT ...@@ -297,7 +308,7 @@ HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT
HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT
const complex64& b) { const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
a = complex64(thrust::complex<float>(a.real, a.imag) /= a = complex64(thrust::complex<float>(a.real, a.imag) /=
thrust::complex<float>(b.real, b.imag)); thrust::complex<float>(b.real, b.imag));
return a; return a;
...@@ -341,6 +352,7 @@ HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) { ...@@ -341,6 +352,7 @@ HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) {
HOSTDEVICE inline bool(isnan)(const complex64& a) { HOSTDEVICE inline bool(isnan)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
// __isnanf not supported on HIP platform
return __isnanf(a.real) || __isnanf(a.imag); return __isnanf(a.real) || __isnanf(a.imag);
#else #else
return std::isnan(a.real) || std::isnan(a.imag); return std::isnan(a.real) || std::isnan(a.imag);
...@@ -349,6 +361,7 @@ HOSTDEVICE inline bool(isnan)(const complex64& a) { ...@@ -349,6 +361,7 @@ HOSTDEVICE inline bool(isnan)(const complex64& a) {
HOSTDEVICE inline bool(isinf)(const complex64& a) { HOSTDEVICE inline bool(isinf)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__)
// __isinff not supported on HIP platform
return __isinff(a.real) || __isinff(a.imag); return __isinff(a.real) || __isinff(a.imag);
#else #else
return std::isinf(a.real) || std::isinf(a.imag); return std::isinf(a.real) || std::isinf(a.imag);
...@@ -360,7 +373,7 @@ HOSTDEVICE inline bool(isfinite)(const complex64& a) { ...@@ -360,7 +373,7 @@ HOSTDEVICE inline bool(isfinite)(const complex64& a) {
} }
HOSTDEVICE inline float(abs)(const complex64& a) { HOSTDEVICE inline float(abs)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::abs(thrust::complex<float>(a.real, a.imag))); return complex64(thrust::abs(thrust::complex<float>(a.real, a.imag)));
#else #else
return std::abs(std::complex<float>(a.real, a.imag)); return std::abs(std::complex<float>(a.real, a.imag));
...@@ -368,7 +381,7 @@ HOSTDEVICE inline float(abs)(const complex64& a) { ...@@ -368,7 +381,7 @@ HOSTDEVICE inline float(abs)(const complex64& a) {
} }
HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) { HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::pow(thrust::complex<float>(a.real, a.imag), return complex64(thrust::pow(thrust::complex<float>(a.real, a.imag),
thrust::complex<float>(b.real, b.imag))); thrust::complex<float>(b.real, b.imag)));
#else #else
...@@ -377,7 +390,7 @@ HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) { ...@@ -377,7 +390,7 @@ HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
} }
HOSTDEVICE inline complex64(sqrt)(const complex64& a) { HOSTDEVICE inline complex64(sqrt)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::sqrt(thrust::complex<float>(a.real, a.imag))); return complex64(thrust::sqrt(thrust::complex<float>(a.real, a.imag)));
#else #else
return std::sqrt(std::complex<float>(a)); return std::sqrt(std::complex<float>(a));
...@@ -385,7 +398,7 @@ HOSTDEVICE inline complex64(sqrt)(const complex64& a) { ...@@ -385,7 +398,7 @@ HOSTDEVICE inline complex64(sqrt)(const complex64& a) {
} }
HOSTDEVICE inline complex64(tanh)(const complex64& a) { HOSTDEVICE inline complex64(tanh)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::tanh(thrust::complex<float>(a.real, a.imag))); return complex64(thrust::tanh(thrust::complex<float>(a.real, a.imag)));
#else #else
return std::tanh(std::complex<float>(a)); return std::tanh(std::complex<float>(a));
...@@ -393,7 +406,7 @@ HOSTDEVICE inline complex64(tanh)(const complex64& a) { ...@@ -393,7 +406,7 @@ HOSTDEVICE inline complex64(tanh)(const complex64& a) {
} }
HOSTDEVICE inline complex64(log)(const complex64& a) { HOSTDEVICE inline complex64(log)(const complex64& a) {
#if defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) || defined(__HIPCC__)
return complex64(thrust::log(thrust::complex<float>(a.real, a.imag))); return complex64(thrust::log(thrust::complex<float>(a.real, a.imag)));
#else #else
return std::log(std::complex<float>(a)); return std::log(std::complex<float>(a));
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#ifndef __NVCC__ #if !defined(__NVCC__) && !defined(__HIPCC__)
#error device_ptr_cast must be include by .cu file #error device_ptr_cast must be include by .cu file
#endif #endif
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc)
#hip
if (WITH_ROCM_PLATFORM) if (WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc)
endif() endif()
# There is no macOS version of NCCL. # There is no macOS version of NCCL.
...@@ -13,7 +13,7 @@ if (NOT APPLE AND NOT WIN32) ...@@ -13,7 +13,7 @@ if (NOT APPLE AND NOT WIN32)
if (WITH_NCCL) if (WITH_NCCL)
list(APPEND CUDA_SRCS nccl.cc) list(APPEND CUDA_SRCS nccl.cc)
endif() endif()
if (WITH_ROCM_PLATFORM) if (WITH_ROCM)
list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc) list(APPEND HIP_SRCS hiprtc.cc rocm_driver.cc)
if (WITH_RCCL) if (WITH_RCCL)
list(APPEND HIP_SRCS rccl.cc) list(APPEND HIP_SRCS rccl.cc)
...@@ -29,9 +29,9 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h) ...@@ -29,9 +29,9 @@ configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
if (CUPTI_FOUND) if (CUPTI_FOUND)
list(APPEND CUDA_SRCS cupti.cc) list(APPEND CUDA_SRCS cupti.cc)
endif(CUPTI_FOUND) endif(CUPTI_FOUND)
if(WITH_ROCM_PLATFORM) if(WITH_ROCM)
hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader)
hip_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
else() else()
nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
......
...@@ -55,7 +55,7 @@ DEFINE_string(miopen_dir, "", ...@@ -55,7 +55,7 @@ DEFINE_string(miopen_dir, "",
DEFINE_string(rocm_dir, "", DEFINE_string(rocm_dir, "",
"Specify path for loading rocm library, such as librocblas, " "Specify path for loading rocm library, such as librocblas, "
"libcurand, libcusolver. For instance, /opt/rocm/lib. " "libmiopen, libhipsparse. For instance, /opt/rocm/lib. "
"If default, dlopen will search rocm from LD_LIBRARY_PATH"); "If default, dlopen will search rocm from LD_LIBRARY_PATH");
DEFINE_string(rccl_dir, "", DEFINE_string(rccl_dir, "",
...@@ -264,7 +264,7 @@ void* GetCublasDsoHandle() { ...@@ -264,7 +264,7 @@ void* GetCublasDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true, return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true,
{cuda_lib_path}); {cuda_lib_path});
#elif PADDLE_WITH_HIP #elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so");
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
...@@ -292,7 +292,7 @@ void* GetCUDNNDsoHandle() { ...@@ -292,7 +292,7 @@ void* GetCUDNNDsoHandle() {
"CUDNN version."); "CUDNN version.");
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true, return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true,
{cuda_lib_path}, win_warn_meg); {cuda_lib_path}, win_warn_meg);
#elif PADDLE_WITH_HIP #elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false, return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
...@@ -316,7 +316,7 @@ void* GetCurandDsoHandle() { ...@@ -316,7 +316,7 @@ void* GetCurandDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true, return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true,
{cuda_lib_path}); {cuda_lib_path});
#elif PADDLE_WITH_HIP #elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so");
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
...@@ -337,8 +337,8 @@ void* GetCusolverDsoHandle() { ...@@ -337,8 +337,8 @@ void* GetCusolverDsoHandle() {
void* GetNVRTCDsoHandle() { void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#elif PADDLE_WITH_HIP #elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so"); return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprtc.so", false);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif #endif
...@@ -347,8 +347,8 @@ void* GetNVRTCDsoHandle() { ...@@ -347,8 +347,8 @@ void* GetNVRTCDsoHandle() {
void* GetCUDADsoHandle() { void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#elif PADDLE_WITH_HIP #elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so"); return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhip_hcc.so", false);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif #endif
...@@ -369,15 +369,24 @@ void* GetWarpCTCDsoHandle() { ...@@ -369,15 +369,24 @@ void* GetWarpCTCDsoHandle() {
} }
void* GetNCCLDsoHandle() { void* GetNCCLDsoHandle() {
#ifdef PADDLE_WITH_HIP
std::string warning_msg(
"You may need to install 'rccl' from ROCM official website: "
"https://rocmdocs.amd.com/en/latest/Installation_Guide/"
"Installation-Guide.html before install PaddlePaddle.");
#else
std::string warning_msg( std::string warning_msg(
"You may need to install 'nccl2' from NVIDIA official website: " "You may need to install 'nccl2' from NVIDIA official website: "
"https://developer.nvidia.com/nccl/nccl-download" "https://developer.nvidia.com/nccl/nccl-download"
"before install PaddlePaddle."); "before install PaddlePaddle.");
#endif
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
warning_msg); warning_msg);
#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL)
return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true, {},
warning_msg);
#else #else
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
warning_msg); warning_msg);
......
...@@ -44,6 +44,8 @@ inline const char* miopenGetErrorString(miopenStatus_t status) { ...@@ -44,6 +44,8 @@ inline const char* miopenGetErrorString(miopenStatus_t status) {
return "MIOPEN_STATUS_INTERNAL_ERROR"; return "MIOPEN_STATUS_INTERNAL_ERROR";
case miopenStatusNotImplemented: case miopenStatusNotImplemented:
return "MIOPEN_STATUS_NOT_IMPLEMENTED"; return "MIOPEN_STATUS_NOT_IMPLEMENTED";
case miopenStatusUnsupportedOp:
return "MIOPEN_STATUS_UNSUPPORTED_OP";
case miopenStatusUnknownError: case miopenStatusUnknownError:
default: default:
return "MIOPEN_STATUS_UNKNOWN_ERROR"; return "MIOPEN_STATUS_UNKNOWN_ERROR";
...@@ -70,6 +72,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -70,6 +72,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed miopen functions in HPPL * include all needed miopen functions in HPPL
**/ **/
#define MIOPEN_DNN_ROUTINE_EACH(__macro) \ #define MIOPEN_DNN_ROUTINE_EACH(__macro) \
__macro(miopenGetVersion); \
__macro(miopenSet4dTensorDescriptor); \ __macro(miopenSet4dTensorDescriptor); \
__macro(miopenSetTensorDescriptor); \ __macro(miopenSetTensorDescriptor); \
__macro(miopenInitConvolutionNdDescriptor); \ __macro(miopenInitConvolutionNdDescriptor); \
...@@ -80,6 +83,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -80,6 +83,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenGetTensorDescriptor); \ __macro(miopenGetTensorDescriptor); \
__macro(miopenCreateTensorDescriptor); \ __macro(miopenCreateTensorDescriptor); \
__macro(miopenDestroyTensorDescriptor); \ __macro(miopenDestroyTensorDescriptor); \
__macro(miopenGetTensorDescriptorSize); \
__macro(miopenSet2dPoolingDescriptor); \ __macro(miopenSet2dPoolingDescriptor); \
__macro(miopenGet2dPoolingDescriptor); \ __macro(miopenGet2dPoolingDescriptor); \
__macro(miopenGetPoolingNdForwardOutputDim); \ __macro(miopenGetPoolingNdForwardOutputDim); \
...@@ -109,9 +113,12 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -109,9 +113,12 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenSoftmaxBackward); \ __macro(miopenSoftmaxBackward); \
__macro(miopenSoftmaxForward); \ __macro(miopenSoftmaxForward); \
__macro(miopenCreateDropoutDescriptor); \ __macro(miopenCreateDropoutDescriptor); \
__macro(miopenDestroyDropoutDescriptor); \
__macro(miopenRestoreDropoutDescriptor); \
__macro(miopenDropoutGetStatesSize); \ __macro(miopenDropoutGetStatesSize); \
__macro(miopenSetDropoutDescriptor); \ __macro(miopenSetDropoutDescriptor); \
__macro(miopenCreateRNNDescriptor); \ __macro(miopenCreateRNNDescriptor); \
__macro(miopenDestroyRNNDescriptor); \
__macro(miopenSetRNNDescriptor); \ __macro(miopenSetRNNDescriptor); \
__macro(miopenGetRNNParamsSize); \ __macro(miopenGetRNNParamsSize); \
__macro(miopenGetRNNWorkspaceSize); \ __macro(miopenGetRNNWorkspaceSize); \
...@@ -120,8 +127,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -120,8 +127,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(miopenRNNBackwardData); \ __macro(miopenRNNBackwardData); \
__macro(miopenRNNBackwardWeights); \ __macro(miopenRNNBackwardWeights); \
__macro(miopenRNNForwardInference); \ __macro(miopenRNNForwardInference); \
__macro(miopenDestroyDropoutDescriptor); \ __macro(miopenGetTensorNumBytes);
__macro(miopenDestroyRNNDescriptor);
MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP) MIOPEN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MIOPEN_WRAP)
......
...@@ -25,6 +25,14 @@ void *rccl_dso_handle; ...@@ -25,6 +25,14 @@ void *rccl_dso_handle;
RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
#if NCCL_VERSION_CODE >= 2212
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP)
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -59,6 +59,18 @@ extern void* rccl_dso_handle; ...@@ -59,6 +59,18 @@ extern void* rccl_dso_handle;
RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) RCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#if NCCL_VERSION_CODE >= 2212
#define RCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast);
RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
#if NCCL_VERSION_CODE >= 2703
#define RCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \
__macro(ncclSend); \
__macro(ncclRecv);
RCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_RCCL_WRAP)
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -36,12 +36,11 @@ extern void *rocblas_dso_handle; ...@@ -36,12 +36,11 @@ extern void *rocblas_dso_handle;
* *
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ rocblas_status operator()(Args... args) { \
using rocblas_func = \ using rocblas_func = decltype(&::__name); \
decltype(::__name(std::declval<Args>()...)) (*)(Args...); \
std::call_once(rocblas_dso_flag, []() { \ std::call_once(rocblas_dso_flag, []() { \
rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ rocblas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \ }); \
...@@ -51,56 +50,65 @@ extern void *rocblas_dso_handle; ...@@ -51,56 +50,65 @@ extern void *rocblas_dso_handle;
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(rocblas_saxpy); \ __macro(rocblas_caxpy); \
__macro(rocblas_daxpy); \ __macro(rocblas_saxpy); \
__macro(rocblas_sscal); \ __macro(rocblas_daxpy); \
__macro(rocblas_dscal); \ __macro(rocblas_zaxpy); \
__macro(rocblas_scopy); \ __macro(rocblas_sscal); \
__macro(rocblas_dcopy); \ __macro(rocblas_dscal); \
__macro(rocblas_sgemv); \ __macro(rocblas_scopy); \
__macro(rocblas_dgemv); \ __macro(rocblas_dcopy); \
__macro(rocblas_sgemm); \ __macro(rocblas_cgemv); \
__macro(rocblas_dgemm); \ __macro(rocblas_sgemv); \
__macro(rocblas_hgemm); \ __macro(rocblas_zgemv); \
__macro(rocblas_dgeam); \ __macro(rocblas_dgemv); \
/*rocblas_gemm_ex function not support at rocm3.5*/ \ __macro(rocblas_cgemm); \
/*__macro(rocblas_gemm_ex); */ \ __macro(rocblas_sgemm); \
__macro(rocblas_sgemm_batched); \ __macro(rocblas_dgemm); \
__macro(rocblas_dgemm_batched); \ __macro(rocblas_hgemm); \
__macro(rocblas_cgemm_batched); \ __macro(rocblas_zgemm); \
__macro(rocblas_zgemm_batched); \ __macro(rocblas_sgeam); \
__macro(rocblas_create_handle); \ __macro(rocblas_strsm); \
__macro(rocblas_destroy_handle); \ __macro(rocblas_dtrsm); \
__macro(rocblas_add_stream); \ __macro(rocblas_dgeam); \
__macro(rocblas_set_stream); \ __macro(rocblas_sgemm_batched); \
__macro(rocblas_get_stream); \ __macro(rocblas_dgemm_batched); \
__macro(rocblas_set_pointer_mode); \ __macro(rocblas_cgemm_batched); \
__macro(rocblas_zgemm_batched); \
__macro(rocblas_create_handle); \
__macro(rocblas_destroy_handle); \
__macro(rocblas_set_stream); \
__macro(rocblas_get_stream); \
__macro(rocblas_set_pointer_mode); \
__macro(rocblas_get_pointer_mode); __macro(rocblas_get_pointer_mode);
ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
// APIs available after CUDA 8.0
#define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
__macro(rocblas_gemm_ex); \
__macro(rocblas_sgemm_strided_batched); \ __macro(rocblas_sgemm_strided_batched); \
__macro(rocblas_dgemm_strided_batched); \ __macro(rocblas_dgemm_strided_batched); \
__macro(rocblas_cgemm_strided_batched); \ __macro(rocblas_cgemm_strided_batched); \
__macro(rocblas_zgemm_strided_batched); \ __macro(rocblas_zgemm_strided_batched); \
__macro(rocblas_hgemm_strided_batched); __macro(rocblas_hgemm_strided_batched);
ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro) // HIP not supported in ROCM3.5
// #define ROCBLAS_BLAS_ROUTINE_EACH_R3(__macro)
ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // __macro(cublasSetMathMode);
// __macro(cublasGetMathMode);
// ROCBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ #define ROCBLAS_BLAS_ROUTINE_EACH_R4(__macro) \
__macro(rocblas_gemm_batched_ex); \ __macro(rocblas_gemm_batched_ex); \
// rocm not support now(rocm3.5) __macro(rocblas_gemm_strided_batched_ex);
// __macro(rocblas_gemm_strided_batched_ex);
ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) ROCBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP)
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef DECLARE_DYNAMIC_LOAD_ROCBLAS_WRAP
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -55,6 +55,7 @@ extern bool HasCUDADriver(); ...@@ -55,6 +55,7 @@ extern bool HasCUDADriver();
__macro(hipModuleLaunchKernel); \ __macro(hipModuleLaunchKernel); \
__macro(hipLaunchKernel); \ __macro(hipLaunchKernel); \
__macro(hipGetDevice); \ __macro(hipGetDevice); \
__macro(hipGetDeviceCount); \
__macro(hipDevicePrimaryCtxGetState) __macro(hipDevicePrimaryCtxGetState)
ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP); ROCM_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCM_WRAP);
......
...@@ -34,10 +34,18 @@ limitations under the License. */ ...@@ -34,10 +34,18 @@ limitations under the License. */
#include <curand.h> #include <curand.h>
#include <thrust/system/cuda/error.h> #include <thrust/system/cuda/error.h>
#include <thrust/system_error.h> #include <thrust/system_error.h>
#include "paddle/fluid/platform/cuda_error.pb.h" #include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include <hiprand.h>
#include <miopen/miopen.h>
#include <rocblas.h>
#include <thrust/system/hip/error.h>
#include <thrust/system_error.h> // NOLINT
#include "paddle/fluid/platform/cuda_error.pb.h" // NOLINT
#endif
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#include <memory> #include <memory>
...@@ -72,9 +80,23 @@ limitations under the License. */ ...@@ -72,9 +80,23 @@ limitations under the License. */
#endif // __APPLE__ #endif // __APPLE__
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hiprand.h"
#include "paddle/fluid/platform/dynload/miopen.h"
#include "paddle/fluid/platform/dynload/rocblas.h"
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
#include <error.h> // NOLINT
#include "paddle/fluid/platform/dynload/rccl.h"
#endif // __APPLE__
#endif // PADDLE_WITH_HIP
// Note: these headers for simplify demangle type string // Note: these headers for simplify demangle type string
#include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/type_defs.h"
// Note: this header for simplify HIP and CUDA type string
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/type_defs.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -82,7 +104,7 @@ class ErrorSummary; ...@@ -82,7 +104,7 @@ class ErrorSummary;
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_int64(gpu_allocator_retry_time); DECLARE_int64(gpu_allocator_retry_time);
#endif #endif
DECLARE_int32(call_stack_level); DECLARE_int32(call_stack_level);
...@@ -406,6 +428,15 @@ struct EnforceNotMet : public std::exception { ...@@ -406,6 +428,15 @@ struct EnforceNotMet : public std::exception {
asm("trap;"); \ asm("trap;"); \
} \ } \
} while (0) } while (0)
#elif defined(__HIPCC__)
#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \
do { \
if (!(_IS_NOT_ERROR)) { \
printf("Error: %s:%d Assertion `%s` failed. " __FORMAT "\n", __FILE__, \
__LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \
abort(); \
} \
} while (0)
#else #else
#define PADDLE_ENFORCE(COND, ...) \ #define PADDLE_ENFORCE(COND, ...) \
do { \ do { \
...@@ -996,5 +1027,188 @@ inline void retry_sleep(unsigned milliseconds) { ...@@ -996,5 +1027,188 @@ inline void retry_sleep(unsigned milliseconds) {
#undef DEFINE_CUDA_STATUS_TYPE #undef DEFINE_CUDA_STATUS_TYPE
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
/** HIP PADDLE ENFORCE FUNCTIONS AND MACROS **/
#ifdef PADDLE_WITH_HIP
/***** HIP ERROR *****/
inline bool is_error(hipError_t e) { return e != hipSuccess; }
inline std::string build_rocm_error_msg(hipError_t e) {
#if defined(PADDLE_WITH_HIP)
int32_t cuda_version = 100;
#else
int32_t cuda_version = -1;
#endif
std::ostringstream sout;
sout << " Hip error(" << e << "), " << hipGetErrorString(e) << ".";
return sout.str();
}
/** HIPRAND ERROR **/
inline bool is_error(hiprandStatus_t stat) {
return stat != HIPRAND_STATUS_SUCCESS;
}
inline const char* hiprandGetErrorString(hiprandStatus_t stat) {
switch (stat) {
case HIPRAND_STATUS_SUCCESS:
return "HIPRAND_STATUS_SUCCESS";
case HIPRAND_STATUS_VERSION_MISMATCH:
return "HIPRAND_STATUS_VERSION_MISMATCH";
case HIPRAND_STATUS_NOT_INITIALIZED:
return "HIPRAND_STATUS_NOT_INITIALIZED";
case HIPRAND_STATUS_ALLOCATION_FAILED:
return "HIPRAND_STATUS_ALLOCATION_FAILED";
case HIPRAND_STATUS_TYPE_ERROR:
return "HIPRAND_STATUS_TYPE_ERROR";
case HIPRAND_STATUS_OUT_OF_RANGE:
return "HIPRAND_STATUS_OUT_OF_RANGE";
case HIPRAND_STATUS_LENGTH_NOT_MULTIPLE:
return "HIPRAND_STATUS_LENGTH_NOT_MULTIPLE";
case HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED:
return "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED";
case HIPRAND_STATUS_LAUNCH_FAILURE:
return "HIPRAND_STATUS_LAUNCH_FAILURE";
case HIPRAND_STATUS_PREEXISTING_FAILURE:
return "HIPRAND_STATUS_PREEXISTING_FAILURE";
case HIPRAND_STATUS_INITIALIZATION_FAILED:
return "HIPRAND_STATUS_INITIALIZATION_FAILED";
case HIPRAND_STATUS_ARCH_MISMATCH:
return "HIPRAND_STATUS_ARCH_MISMATCH";
case HIPRAND_STATUS_INTERNAL_ERROR:
return "HIPRAND_STATUS_INTERNAL_ERROR";
case HIPRAND_STATUS_NOT_IMPLEMENTED:
return "HIPRAND_STATUS_NOT_IMPLEMENTED";
default:
return "Unknown hiprand status";
}
}
inline std::string build_rocm_error_msg(hiprandStatus_t stat) {
std::string msg(" Hiprand error, ");
return msg + hiprandGetErrorString(stat) + " ";
}
/***** MIOPEN ERROR *****/
inline bool is_error(miopenStatus_t stat) {
return stat != miopenStatusSuccess;
}
inline std::string build_rocm_error_msg(miopenStatus_t stat) {
std::string msg(" Miopen error, ");
return msg + platform::dynload::miopenGetErrorString(stat) + " ";
}
/***** ROCBLAS ERROR *****/
inline bool is_error(rocblas_status stat) {
return stat != rocblas_status_success;
}
inline const char* rocblasGetErrorString(rocblas_status stat) {
switch (stat) {
case rocblas_status_invalid_handle:
return "rocblas_status_invalid_handle";
case rocblas_status_memory_error:
return "rocblas_status_memory_error";
case rocblas_status_invalid_value:
return "rocblas_status_invalid_value";
case rocblas_status_not_implemented:
return "rocblas_status_not_implemented";
case rocblas_status_invalid_pointer:
return "rocblas_status_invalid_pointer";
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_internal_error:
return "rocblas_status_internal_error";
default:
return "Unknown cublas status";
}
}
inline std::string build_rocm_error_msg(rocblas_status stat) {
std::string msg(" Rocblas error, ");
return msg + rocblasGetErrorString(stat) + " ";
}
/****** RCCL ERROR ******/
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
inline bool is_error(ncclResult_t nccl_result) {
return nccl_result != ncclSuccess;
}
inline std::string build_rocm_error_msg(ncclResult_t nccl_result) {
std::string msg(" Rccl error, ");
return msg + platform::dynload::ncclGetErrorString(nccl_result) + " ";
}
#endif // not(__APPLE__) and PADDLE_WITH_NCCL
namespace details {
template <typename T>
struct CudaStatusType {};
#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \
template <> \
struct CudaStatusType<type> { \
using Type = type; \
static constexpr Type kSuccess = success_value; \
}
DEFINE_CUDA_STATUS_TYPE(hipError_t, hipSuccess);
DEFINE_CUDA_STATUS_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS);
DEFINE_CUDA_STATUS_TYPE(miopenStatus_t, miopenStatusSuccess);
DEFINE_CUDA_STATUS_TYPE(rocblas_status, rocblas_status_success);
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess);
#endif
} // namespace details
#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_rocm_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
inline void retry_sleep(unsigned millisecond) {
#ifdef _WIN32
Sleep(millisecond);
#else
sleep(millisecond);
#endif
}
#define PADDLE_RETRY_CUDA_SUCCESS(COND) \
do { \
auto __cond__ = (COND); \
int retry_count = 1; \
using __CUDA_STATUS_TYPE__ = decltype(__cond__); \
constexpr auto __success_type__ = \
::paddle::platform::details::CudaStatusType< \
__CUDA_STATUS_TYPE__>::kSuccess; \
while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \
retry_sleep(FLAGS_gpu_allocator_retry_time); \
__cond__ = (COND); \
++retry_count; \
} \
if (UNLIKELY(__cond__ != __success_type__)) { \
auto __summary__ = ::paddle::platform::errors::External( \
::paddle::platform::build_rocm_error_msg(__cond__)); \
__THROW_ERROR_INTERNAL__(__summary__); \
} \
} while (0)
#undef DEFINE_CUDA_STATUS_TYPE
#endif // PADDLE_WITH_HIP
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -295,7 +295,7 @@ TEST(EOF_EXCEPTION, THROW_EOF) { ...@@ -295,7 +295,7 @@ TEST(EOF_EXCEPTION, THROW_EOF) {
EXPECT_TRUE(caught_eof); EXPECT_TRUE(caught_eof);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T> template <typename T>
bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") {
PADDLE_ENFORCE_CUDA_SUCCESS(value); PADDLE_ENFORCE_CUDA_SUCCESS(value);
...@@ -312,7 +312,35 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { ...@@ -312,7 +312,35 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) {
return ex_msg.find(msg) != std::string::npos; return ex_msg.find(msg) != std::string::npos;
} }
} }
#ifdef PADDLE_WITH_HIP
TEST(enforce, hip_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(hipSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(hipErrorInvalidValue, "Hip error"));
EXPECT_TRUE(CheckCudaStatusFailure(hipErrorOutOfMemory, "Hip error"));
EXPECT_TRUE(CheckCudaStatusSuccess(HIPRAND_STATUS_SUCCESS));
EXPECT_TRUE(
CheckCudaStatusFailure(HIPRAND_STATUS_VERSION_MISMATCH, "Hiprand error"));
EXPECT_TRUE(
CheckCudaStatusFailure(HIPRAND_STATUS_NOT_INITIALIZED, "Hiprand error"));
EXPECT_TRUE(CheckCudaStatusSuccess(miopenStatusSuccess));
EXPECT_TRUE(
CheckCudaStatusFailure(miopenStatusNotInitialized, "Miopen error"));
EXPECT_TRUE(CheckCudaStatusFailure(miopenStatusAllocFailed, "Miopen error"));
EXPECT_TRUE(CheckCudaStatusSuccess(rocblas_status_success));
EXPECT_TRUE(
CheckCudaStatusFailure(rocblas_status_invalid_handle, "Rocblas error"));
EXPECT_TRUE(
CheckCudaStatusFailure(rocblas_status_invalid_value, "Rocblas error"));
#if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error"));
EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error"));
#endif
}
#else
TEST(enforce, cuda_success) { TEST(enforce, cuda_success) {
EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error"));
...@@ -341,6 +369,7 @@ TEST(enforce, cuda_success) { ...@@ -341,6 +369,7 @@ TEST(enforce, cuda_success) {
#endif #endif
} }
#endif #endif
#endif
struct CannotToStringType { struct CannotToStringType {
explicit CannotToStringType(int num) : num_(num) {} explicit CannotToStringType(int num) : num_(num) {}
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#define CUDA_VERSION 10000
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#endif #endif
...@@ -41,6 +41,7 @@ limitations under the License. */ ...@@ -41,6 +41,7 @@ limitations under the License. */
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif #endif
#ifdef __HIPCC__ #ifdef __HIPCC__
#define PADDLE_CUDA_FP16 #define PADDLE_CUDA_FP16
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>
...@@ -90,7 +91,7 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -90,7 +91,7 @@ struct PADDLE_ALIGN(2) float16 {
#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit float16(const half& h) { HOSTDEVICE inline explicit float16(const half& h) {
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP))
#if CUDA_VERSION >= 9000 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x; x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
#else #else
x = h.x; x = h.x;
...@@ -110,9 +111,8 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -110,9 +111,8 @@ struct PADDLE_ALIGN(2) float16 {
#endif #endif
HOSTDEVICE inline explicit float16(float val) { HOSTDEVICE inline explicit float16(float val) {
#if ((defined(PADDLE_CUDA_FP16)) && \ #if defined(PADDLE_CUDA_FP16) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
(defined(__HIP_DEVICE_COMPILE__))))
half tmp = __float2half(val); half tmp = __float2half(val);
x = *reinterpret_cast<uint16_t*>(&tmp); x = *reinterpret_cast<uint16_t*>(&tmp);
...@@ -154,7 +154,7 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -154,7 +154,7 @@ struct PADDLE_ALIGN(2) float16 {
// Assignment operators // Assignment operators
#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline float16& operator=(const half& rhs) { HOSTDEVICE inline float16& operator=(const half& rhs) {
#if CUDA_VERSION >= 9000 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x; x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
#else #else
x = rhs.x; x = rhs.x;
...@@ -233,7 +233,7 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -233,7 +233,7 @@ struct PADDLE_ALIGN(2) float16 {
// Conversion opertors // Conversion opertors
#ifdef PADDLE_CUDA_FP16 #ifdef PADDLE_CUDA_FP16
HOSTDEVICE inline explicit operator half() const { HOSTDEVICE inline explicit operator half() const {
#if CUDA_VERSION >= 9000 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
__half_raw h; __half_raw h;
h.x = x; h.x = x;
return half(h); return half(h);
...@@ -258,9 +258,8 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -258,9 +258,8 @@ struct PADDLE_ALIGN(2) float16 {
#endif #endif
HOSTDEVICE inline explicit operator float() const { HOSTDEVICE inline explicit operator float() const {
#if (defined(PADDLE_CUDA_FP16) && \ #if defined(PADDLE_CUDA_FP16) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
(defined(__HIP_DEVICE_COMPILE__))))
half tmp = *reinterpret_cast<const half*>(this); half tmp = *reinterpret_cast<const half*>(this);
return __half2float(tmp); return __half2float(tmp);
...@@ -370,8 +369,7 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -370,8 +369,7 @@ struct PADDLE_ALIGN(2) float16 {
// xuan[TODO] change for rocm // xuan[TODO] change for rocm
#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 #if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
DEVICE inline half operator+(const half& a, const half& b) { DEVICE inline half operator+(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hadd(a, b); return __hadd(a, b);
#else #else
float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b)); float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
...@@ -380,8 +378,7 @@ DEVICE inline half operator+(const half& a, const half& b) { ...@@ -380,8 +378,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
} }
DEVICE inline half operator-(const half& a, const half& b) { DEVICE inline half operator-(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hsub(a, b); return __hsub(a, b);
#else #else
float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b)); float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
...@@ -390,8 +387,7 @@ DEVICE inline half operator-(const half& a, const half& b) { ...@@ -390,8 +387,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
} }
DEVICE inline half operator*(const half& a, const half& b) { DEVICE inline half operator*(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hmul(a, b); return __hmul(a, b);
#else #else
float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b)); float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
...@@ -400,8 +396,7 @@ DEVICE inline half operator*(const half& a, const half& b) { ...@@ -400,8 +396,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
} }
DEVICE inline half operator/(const half& a, const half& b) { DEVICE inline half operator/(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
float num = __half2float(a); float num = __half2float(a);
float denom = __half2float(b); float denom = __half2float(b);
return __float2half(num / denom); return __float2half(num / denom);
...@@ -412,8 +407,7 @@ DEVICE inline half operator/(const half& a, const half& b) { ...@@ -412,8 +407,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
} }
DEVICE inline half operator-(const half& a) { DEVICE inline half operator-(const half& a) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hneg(a); return __hneg(a);
#else #else
float res = -static_cast<float>(float16(a)); float res = -static_cast<float>(float16(a));
...@@ -421,6 +415,7 @@ DEVICE inline half operator-(const half& a) { ...@@ -421,6 +415,7 @@ DEVICE inline half operator-(const half& a) {
#endif #endif
} }
#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__
DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
...@@ -440,10 +435,10 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT ...@@ -440,10 +435,10 @@ DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
#endif
DEVICE inline bool operator==(const half& a, const half& b) { DEVICE inline bool operator==(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __heq(a, b); return __heq(a, b);
#else #else
return static_cast<float>(float16(a)) == static_cast<float>(float16(b)); return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
...@@ -451,8 +446,7 @@ DEVICE inline bool operator==(const half& a, const half& b) { ...@@ -451,8 +446,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
} }
DEVICE inline bool operator!=(const half& a, const half& b) { DEVICE inline bool operator!=(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hne(a, b); return __hne(a, b);
#else #else
return static_cast<float>(float16(a)) != static_cast<float>(float16(b)); return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
...@@ -460,8 +454,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) { ...@@ -460,8 +454,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
} }
DEVICE inline bool operator<(const half& a, const half& b) { DEVICE inline bool operator<(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hlt(a, b); return __hlt(a, b);
#else #else
return static_cast<float>(float16(a)) < static_cast<float>(float16(b)); return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
...@@ -469,8 +462,7 @@ DEVICE inline bool operator<(const half& a, const half& b) { ...@@ -469,8 +462,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
} }
DEVICE inline bool operator<=(const half& a, const half& b) { DEVICE inline bool operator<=(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hle(a, b); return __hle(a, b);
#else #else
return static_cast<float>(float16(a)) <= static_cast<float>(float16(b)); return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
...@@ -478,8 +470,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) { ...@@ -478,8 +470,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
} }
DEVICE inline bool operator>(const half& a, const half& b) { DEVICE inline bool operator>(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hgt(a, b); return __hgt(a, b);
#else #else
return static_cast<float>(float16(a)) > static_cast<float>(float16(b)); return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
...@@ -487,8 +478,7 @@ DEVICE inline bool operator>(const half& a, const half& b) { ...@@ -487,8 +478,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
} }
DEVICE inline bool operator>=(const half& a, const half& b) { DEVICE inline bool operator>=(const half& a, const half& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hge(a, b); return __hge(a, b);
#else #else
return static_cast<float>(float16(a)) >= static_cast<float>(float16(b)); return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
...@@ -499,36 +489,66 @@ DEVICE inline bool operator>=(const half& a, const half& b) { ...@@ -499,36 +489,66 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
// Arithmetic operators for float16 on GPU // Arithmetic operators for float16 on GPU
#if defined(PADDLE_CUDA_FP16) #if defined(PADDLE_CUDA_FP16)
// HIPCC has compile error if call __device__ function __hadd in __host__
// __device__ function
#if defined(__HIPCC__)
DEVICE inline float16 operator+(const float16& a, const float16& b) {
return float16(__hadd(half(a), half(b)));
}
HOST inline float16 operator+(const float16& a, const float16& b) {
return float16(static_cast<float>(a) + static_cast<float>(b));
}
#else
HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hadd(half(a), half(b))); return float16(__hadd(half(a), half(b)));
#else #else
return float16(static_cast<float>(a) + static_cast<float>(b)); return float16(static_cast<float>(a) + static_cast<float>(b));
#endif #endif
} }
#endif
// HIPCC has compile error if call __device__ function __hsub in __host__
// __device__ function
#if defined(__HIPCC__)
DEVICE inline float16 operator-(const float16& a, const float16& b) {
return float16(__hsub(half(a), half(b)));
}
HOST inline float16 operator-(const float16& a, const float16& b) {
return float16(static_cast<float>(a) - static_cast<float>(b));
}
#else
HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hsub(half(a), half(b))); return float16(__hsub(half(a), half(b)));
#else #else
return float16(static_cast<float>(a) - static_cast<float>(b)); return float16(static_cast<float>(a) - static_cast<float>(b));
#endif #endif
} }
#endif
// HIPCC has compile error if call __device__ function __hmul in __host__
// __device__ function
#if defined(__HIPCC__)
DEVICE inline float16 operator*(const float16& a, const float16& b) {
return float16(__hmul(half(a), half(b)));
}
HOST inline float16 operator*(const float16& a, const float16& b) {
return float16(static_cast<float>(a) * static_cast<float>(b));
}
#else
HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hmul(half(a), half(b))); return float16(__hmul(half(a), half(b)));
#else #else
return float16(static_cast<float>(a) * static_cast<float>(b)); return float16(static_cast<float>(a) * static_cast<float>(b));
#endif #endif
} }
#endif
HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
// TODO(kexinzhao): check which cuda version starts to support __hdiv // TODO(kexinzhao): check which cuda version starts to support __hdiv
float num = __half2float(half(a)); float num = __half2float(half(a));
float denom = __half2float(half(b)); float denom = __half2float(half(b));
...@@ -538,9 +558,20 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { ...@@ -538,9 +558,20 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
#endif #endif
} }
// HIPCC has compile error if call __device__ function __hneg in __host__
// __device__ function
#if defined(__HIPCC__)
DEVICE inline float16 operator-(const float16& a) {
return float16(__hneg(half(a)));
}
HOST inline float16 operator-(const float16& a) {
float16 res;
res.x = a.x ^ 0x8000;
return res;
}
#else
HOSTDEVICE inline float16 operator-(const float16& a) { HOSTDEVICE inline float16 operator-(const float16& a) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
(defined(__HIP_DEVICE_COMPILE__)))
return float16(__hneg(half(a))); return float16(__hneg(half(a)));
#else #else
float16 res; float16 res;
...@@ -548,6 +579,7 @@ HOSTDEVICE inline float16 operator-(const float16& a) { ...@@ -548,6 +579,7 @@ HOSTDEVICE inline float16 operator-(const float16& a) {
return res; return res;
#endif #endif
} }
#endif
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b; a = a + b;
...@@ -569,18 +601,27 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT ...@@ -569,18 +601,27 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT
return a; return a;
} }
// HIPCC has compile error if call __device__ function __heq in __host__
// __device__ function
#if defined(__HIPCC__)
DEVICE inline bool operator==(const float16& a, const float16& b) {
return __heq(half(a), half(b));
}
HOST inline bool operator==(const float16& a, const float16& b) {
return static_cast<float>(a) == static_cast<float>(b);
}
#else // CUDA
HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
(defined(__HIP_DEVICE_COMPILE__)))
return __heq(half(a), half(b)); return __heq(half(a), half(b));
#else #else
return static_cast<float>(a) == static_cast<float>(b); return static_cast<float>(a) == static_cast<float>(b);
#endif #endif
} }
#endif
HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hne(half(a), half(b)); return __hne(half(a), half(b));
#else #else
return static_cast<float>(a) != static_cast<float>(b); return static_cast<float>(a) != static_cast<float>(b);
...@@ -588,8 +629,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { ...@@ -588,8 +629,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
} }
HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hlt(half(a), half(b)); return __hlt(half(a), half(b));
#else #else
return static_cast<float>(a) < static_cast<float>(b); return static_cast<float>(a) < static_cast<float>(b);
...@@ -597,8 +637,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { ...@@ -597,8 +637,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
} }
HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hle(half(a), half(b)); return __hle(half(a), half(b));
#else #else
return static_cast<float>(a) <= static_cast<float>(b); return static_cast<float>(a) <= static_cast<float>(b);
...@@ -606,8 +645,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { ...@@ -606,8 +645,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
} }
HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hgt(half(a), half(b)); return __hgt(half(a), half(b));
#else #else
return static_cast<float>(a) > static_cast<float>(b); return static_cast<float>(a) > static_cast<float>(b);
...@@ -615,8 +653,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { ...@@ -615,8 +653,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
} }
HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
#if ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ #if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
(defined(__HIP_DEVICE_COMPILE__)))
return __hge(half(a), half(b)); return __hge(half(a), half(b));
#else #else
return static_cast<float>(a) >= static_cast<float>(b); return static_cast<float>(a) >= static_cast<float>(b);
...@@ -881,15 +918,20 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) { ...@@ -881,15 +918,20 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
return res; return res;
} }
// HIPCC has compile error if call __device__ function __hisnan in __host__
// __device__ function
#if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__)
DEVICE inline bool(isnan)(const float16& a) { return __hisnan(half(a)); }
HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; }
#else
HOSTDEVICE inline bool(isnan)(const float16& a) { HOSTDEVICE inline bool(isnan)(const float16& a) {
#if (defined(PADDLE_CUDA_FP16) && \ #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \
(defined(__HIP_DEVICE_COMPILE__))))
return __hisnan(half(a)); return __hisnan(half(a));
#else #else
return (a.x & 0x7fff) > 0x7c00; return (a.x & 0x7fff) > 0x7c00;
#endif #endif
} }
#endif
HOSTDEVICE inline bool(isinf)(const float16& a) { HOSTDEVICE inline bool(isinf)(const float16& a) {
return (a.x & 0x7fff) == 0x7c00; return (a.x & 0x7fff) == 0x7c00;
...@@ -900,9 +942,8 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) { ...@@ -900,9 +942,8 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
} }
HOSTDEVICE inline float16(abs)(const float16& a) { HOSTDEVICE inline float16(abs)(const float16& a) {
#if (defined(PADDLE_CUDA_FP16) && \ #if defined(PADDLE_CUDA_FP16) && \
((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530))
(defined(__HIP_DEVICE_COMPILE__))))
return float16(::fabs(static_cast<float>(a))); return float16(::fabs(static_cast<float>(a)));
#else #else
return float16(std::abs(static_cast<float>(a))); return float16(std::abs(static_cast<float>(a)));
......
IF(WITH_GPU) IF(WITH_GPU OR WITH_ROCM)
cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost) cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost)
ENDIF() ENDIF()
...@@ -20,7 +20,11 @@ namespace paddle { ...@@ -20,7 +20,11 @@ namespace paddle {
namespace platform { namespace platform {
namespace stream { namespace stream {
#ifdef PADDLE_WITH_HIP
constexpr unsigned int kDefaultFlag = hipStreamDefault;
#else
constexpr unsigned int kDefaultFlag = cudaStreamDefault; constexpr unsigned int kDefaultFlag = cudaStreamDefault;
#endif
bool CUDAStream::Init(const Place& place, const Priority& priority) { bool CUDAStream::Init(const Place& place, const Priority& priority) {
PADDLE_ENFORCE_EQ(is_gpu_place(place), true, PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
...@@ -29,11 +33,21 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) { ...@@ -29,11 +33,21 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) {
place_ = place; place_ = place;
CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device); CUDADeviceGuard guard(BOOST_GET_CONST(CUDAPlace, place_).device);
if (priority == Priority::kHigh) { if (priority == Priority::kHigh) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithPriority(&stream_, kDefaultFlag, -1));
#else
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1)); cudaStreamCreateWithPriority(&stream_, kDefaultFlag, -1));
#endif
} else if (priority == Priority::kNormal) { } else if (priority == Priority::kNormal) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0));
#endif
} }
callback_manager_.reset(new StreamCallbackManager(stream_)); callback_manager_.reset(new StreamCallbackManager(stream_));
VLOG(3) << "CUDAStream Init stream: " << stream_ VLOG(3) << "CUDAStream Init stream: " << stream_
...@@ -46,12 +60,27 @@ void CUDAStream::Destroy() { ...@@ -46,12 +60,27 @@ void CUDAStream::Destroy() {
Wait(); Wait();
WaitCallback(); WaitCallback();
if (stream_) { if (stream_) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
#endif
} }
stream_ = nullptr; stream_ = nullptr;
} }
void CUDAStream::Wait() const { void CUDAStream::Wait() const {
#ifdef PADDLE_WITH_HIP
hipError_t e_sync = hipSuccess;
#if !defined(_WIN32)
e_sync = hipStreamSynchronize(stream_);
#else
while (e_sync = hipStreamQuery(stream_)) {
if (e_sync == hipErrorNotReady) continue;
break;
}
#endif
#else
cudaError_t e_sync = cudaSuccess; cudaError_t e_sync = cudaSuccess;
#if !defined(_WIN32) #if !defined(_WIN32)
e_sync = cudaStreamSynchronize(stream_); e_sync = cudaStreamSynchronize(stream_);
...@@ -61,6 +90,7 @@ void CUDAStream::Wait() const { ...@@ -61,6 +90,7 @@ void CUDAStream::Wait() const {
break; break;
} }
#endif #endif
#endif // PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(e_sync); PADDLE_ENFORCE_CUDA_SUCCESS(e_sync);
} }
......
...@@ -26,7 +26,7 @@ namespace paddle { ...@@ -26,7 +26,7 @@ namespace paddle {
namespace platform { namespace platform {
namespace stream { namespace stream {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
enum class Priority : uint8_t { enum class Priority : uint8_t {
kNull = 0x0, kNull = 0x0,
...@@ -51,28 +51,55 @@ class CUDAStream final { ...@@ -51,28 +51,55 @@ class CUDAStream final {
} }
template <typename Callback> template <typename Callback>
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev, Callback callback) const {
callback();
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev, Callback callback) const { void RecordEvent(cudaEvent_t ev, Callback callback) const {
callback(); callback();
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
} }
#endif
#ifdef PADDLE_WITH_HIP
void RecordEvent(hipEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(ev, stream_));
}
#else
void RecordEvent(cudaEvent_t ev) const { void RecordEvent(cudaEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
} }
#endif
#ifdef PADDLE_WITH_HIP
void WaitEvent(hipEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(stream_, ev, 0));
}
#else
void WaitEvent(cudaEvent_t ev) const { void WaitEvent(cudaEvent_t ev) const {
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream_, ev, 0));
} }
#endif
void Wait() const; void Wait() const;
void WaitCallback() const { callback_manager_->Wait(); } void WaitCallback() const { callback_manager_->Wait(); }
#ifdef PADDLE_WITH_HIP
const hipStream_t& raw_stream() const { return stream_; }
#else
const cudaStream_t& raw_stream() const { return stream_; } const cudaStream_t& raw_stream() const { return stream_; }
#endif
void Destroy(); void Destroy();
private: private:
Place place_; Place place_;
#ifdef PADDLE_WITH_HIP
hipStream_t stream_{nullptr};
#else
cudaStream_t stream_{nullptr}; cudaStream_t stream_{nullptr};
#endif
Priority priority_{Priority::kNormal}; Priority priority_{Priority::kNormal};
std::unique_ptr<StreamCallbackManager> callback_manager_; std::unique_ptr<StreamCallbackManager> callback_manager_;
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
namespace paddle {
#ifdef PADDLE_WITH_HIP
#define gpuSuccess hipSuccess
using gpuStream_t = hipStream_t;
using gpuError_t = hipError_t;
using gpuEvent_t = hipEvent_t;
#else
#define gpuSuccess cudaSuccess
using gpuStream_t = cudaStream_t;
using gpuError_t = cudaError_t;
using gpuEvent_t = cudaEvent_t;
#endif
} // namespace paddle
...@@ -30,7 +30,7 @@ ENV LC_ALL en_US.UTF-8 ...@@ -30,7 +30,7 @@ ENV LC_ALL en_US.UTF-8
ENV LANG en_US.UTF-8 ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8 ENV LANGUAGE en_US.UTF-8
RUN yum install -y epel-release deltarpm sudo openssh-server openssl-devel gettext-devel sqlite-devel \ RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \
zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \ zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \
make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel
...@@ -65,6 +65,15 @@ RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \ ...@@ -65,6 +65,15 @@ RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \
RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev
# fix rocthrust # fix rocthrust
RUN sed -i '21 a #include <thrust/system/hip/config.h>' /opt/rocm/include/thrust/system/hip/detail/error.inl RUN sed -i '21 a #include <thrust/system/hip/config.h>' /opt/rocm/include/thrust/system/hip/detail/error.inl
# export ROCM env
ENV ROCM_PATH=/opt/rocm
ENV HIP_PATH=/opt/rocm/hip
ENV HIP_CLANG_PATH=/opt/rocm/llvm/bin
ENV PATH=/opt/rocm/bin:$PATH
ENV PATH=/opt/rocm/hcc/bin:$PATH
ENV PATH=/opt/rocm/hip/bin:$PATH
ENV PATH=/opt/rocm/opencl/bin:$PATH
ENV PATH=/opt/rocm/llvm/bin:$PATH
# git 2.17.1 # git 2.17.1
RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \ RUN cd /opt && wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \
...@@ -117,6 +126,13 @@ RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \ ...@@ -117,6 +126,13 @@ RUN sed -i "s/^#PermitRootLogin/PermitRootLogin/" /etc/ssh/sshd_config && \
sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \ sed -i "s/^#PubkeyAuthentication/PubkeyAuthentication/" /etc/ssh/sshd_config && \
sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config sed -i "s/^#RSAAuthentication/RSAAuthentication/" /etc/ssh/sshd_config
# patchelf
RUN yum install -y patchelf && \
yum clean all && \
rm -rf /var/cache/yum && \
rm -rf /var/lib/yum/yumdb && \
rm -rf /var/lib/yum/history
# swig 2.0.12 # swig 2.0.12
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册