Commit a5feb771 authored by Kexin Zhao

address pr comment

Parent 19e5c24f
@@ -16,9 +16,14 @@ limitations under the License. */
#include <cstdint>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/platform/hostdevice.h"
#ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
#else
@@ -31,25 +36,12 @@ limitations under the License. */
#define PADDLE_CLANG_VER 0
#endif // __clang__
#ifdef __CUDACC__
#define PADDLE_HOSTDEVICE __host__ __device__
#if CUDA_VERSION >= 7050
#if defined(__CUDACC__) && CUDA_VERSION >= 7050
#define PADDLE_CUDA_FP16
#include <cuda_fp16.h>
#endif // CUDA_VERSION >= 7050
#else
#define PADDLE_HOSTDEVICE
#endif // __CUDACC__
#ifdef __arm__
#define PADDLE_ARM_32
#endif
#ifdef __aarch64__
#define PADDLE_ARM_64
#endif
#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64)
#if defined(__arm__) || defined(__aarch64__)
#define PADDLE_ARM
#endif
@@ -58,19 +50,12 @@ limitations under the License. */
#include <arm_neon.h>
#endif
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32)
#define PADDLE_NEON_32
#endif
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64)
#define PADDLE_NEON_64
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
(PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
#define PADDLE_WITH_NATIVE_FP16
#endif
#ifdef PADDLE_ARM
#ifdef __F16C__
#undef __F16C__
#endif // __F16C__
#else
#ifndef PADDLE_ARM
#include <immintrin.h>
#endif // PADDLE_ARM
@@ -78,27 +63,20 @@ limitations under the License. */
namespace paddle {
struct float16;
namespace fp16_impl {
// Convert from float to half precision in round-to-nearest-even mode
PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f);
PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
} // namespace fp16_impl
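A few reference bit patterns make the round-to-nearest-even contract above easy to spot-check. This is an illustrative sketch, not part of the patch (the function name is invented, and it assumes the header plus <cassert> are included); binary16 is 1 sign, 5 exponent, and 10 mantissa bits with an exponent bias of 15:

inline void check_rne_encodings() {
  assert(float16(1.0f).x == 0x3c00);   // 0 01111 0000000000
  assert(float16(0.5f).x == 0x3800);   // exponent one below the bias
  assert(float16(2.0f).x == 0x4000);   // exponent one above the bias
  assert(float16(-1.0f).x == 0xbc00);  // 1.0 with the sign bit set
}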
// Use PADDLE_ALIGN(2) to ensure that each float16 will be allocated
// and aligned at least on a 2-byte boundary, which leads to efficient
// memory access of float16 struct and also makes float16 compatible
// with CUDA half, ARM float16_t, and Eigen::half data types.
struct PADDLE_ALIGN(2) float16 {
public:
uint16_t x;
PADDLE_HOSTDEVICE inline float16() : x(0) {}
HOSTDEVICE inline float16() : x(0) {}
PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
#ifdef PADDLE_CUDA_FP16
PADDLE_HOSTDEVICE inline float16(const half& h) {
HOSTDEVICE inline explicit float16(const half& h) {
#if CUDA_VERSION >= 9000
x = reinterpret_cast<const __half_raw*>(&h)->x;
#else
@@ -107,78 +85,64 @@ struct PADDLE_ALIGN(2) float16 {
}
#endif // PADDLE_CUDA_FP16
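The `__half_raw` cast above is needed because CUDA 9.0 made `half` an opaque type, whereas on CUDA 7.5/8.0 it is a plain struct with a public `x` member. A minimal sketch of the two access paths (the helper name is illustrative, not from the patch):

// Sketch: reading the raw 16 bits out of a CUDA half, by toolkit version.
inline uint16_t half_bits(const half& h) {
#if CUDA_VERSION >= 9000
  // CUDA 9.0+: half is opaque; __half_raw exposes its storage.
  return reinterpret_cast<const __half_raw*>(&h)->x;
#else
  // CUDA 7.5/8.0: half has a public x member.
  return h.x;
#endif
}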
PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
(PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
#ifdef PADDLE_WITH_NATIVE_FP16
// __fp16 is a native half precision data type for arm cpu,
// float16_t is an alias for __fp16 in arm_fp16.h,
// which is included in arm_neon.h.
PADDLE_HOSTDEVICE inline float16(const float16_t& h) {
float16_t tmp = h;
x = *reinterpret_cast<uint16_t*>(&tmp);
HOSTDEVICE inline explicit float16(const float16_t& h) {
x = *reinterpret_cast<const uint16_t*>(&h);
}
#endif
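On a toolchain where PADDLE_WITH_NATIVE_FP16 is defined, the constructor above is a pure bit copy from the compiler's native half type. A hedged usage sketch (assuming an ARMv8.2-A build; the function name is illustrative):

inline void check_native_fp16_interop() {
  float16_t native = static_cast<float16_t>(1.5f);  // native __fp16 value
  float16 wrapped(native);                          // bit-copies the value
  assert(wrapped.x == 0x3e00);  // 1.5 = 0 01111 1000000000 in binary16
}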
PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
PADDLE_HOSTDEVICE inline explicit float16(int8_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
PADDLE_HOSTDEVICE inline explicit float16(int16_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
HOSTDEVICE inline explicit float16(float val) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
half tmp = __float2half(val);
x = *reinterpret_cast<uint16_t*>(&tmp);
PADDLE_HOSTDEVICE inline explicit float16(int32_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
#elif defined(PADDLE_NEON)
float32x4_t tmp = vld1q_dup_f32(&val);
float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
x = *reinterpret_cast<uint16_t*>(&res);
PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
#elif defined(__F16C__)
x = _cvtss_sh(val, 0);
PADDLE_HOSTDEVICE inline explicit float16(int64_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
#else
// Conversion routine adapted from
// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
Bits v, s;
v.f = val;
uint32_t sign = v.si & sigN;
v.si ^= sign;
sign >>= shiftSign; // logical shift
s.si = mulN;
s.si = s.f * v.f; // correct subnormals
v.si ^= (s.si ^ v.si) & -(minN > v.si);
v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
v.ui >>= shift; // logical shift
v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
x = v.ui | sign;
PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
#endif
}
PADDLE_HOSTDEVICE inline explicit float16(float val) {
float16 res = fp16_impl::float_to_half_rn(val);
x = res.x;
}
HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
PADDLE_HOSTDEVICE inline explicit float16(double val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
}
template <class T>
HOSTDEVICE inline explicit float16(const T& val)
: x(float16(static_cast<float>(val)).x) {}
PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) {
HOSTDEVICE inline float16& operator=(const float16& rhs) {
x = rhs.x;
return *this;
}
#ifdef PADDLE_CUDA_FP16
PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) {
HOSTDEVICE inline float16& operator=(const half& rhs) {
#if CUDA_VERSION >= 9000
x = reinterpret_cast<const __half_raw*>(&rhs)->x;
#else
@@ -188,87 +152,75 @@ struct PADDLE_ALIGN(2) float16 {
}
#endif
PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
x = rhs.x;
return *this;
}
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
(PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
float16_t tmp = rhs;
x = *reinterpret_cast<uint16_t*>(&tmp);
#ifdef PADDLE_WITH_NATIVE_FP16
HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
x = *reinterpret_cast<const uint16_t*>(&rhs);
return *this;
}
#endif
PADDLE_HOSTDEVICE inline float16& operator=(bool b) {
HOSTDEVICE inline float16& operator=(bool b) {
x = b ? 0x3c00 : 0;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(int8_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(uint8_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(int16_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(uint16_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(int32_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(uint32_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(int64_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(uint64_t val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(float val) {
float16 res = fp16_impl::float_to_half_rn(val);
x = res.x;
HOSTDEVICE inline float16& operator=(float val) {
x = float16(val).x;
return *this;
}
PADDLE_HOSTDEVICE inline float16& operator=(double val) {
float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
x = res.x;
HOSTDEVICE inline float16& operator=(double val) {
x = float16(val).x;
return *this;
}
#ifdef PADDLE_CUDA_FP16
PADDLE_HOSTDEVICE inline operator half() const {
HOSTDEVICE inline explicit operator half() const {
#if CUDA_VERSION >= 9000
__half_raw h;
h.x = x;
@@ -281,186 +233,504 @@ struct PADDLE_ALIGN(2) float16 {
}
#endif // PADDLE_CUDA_FP16
PADDLE_HOSTDEVICE inline operator Eigen::half() const {
HOSTDEVICE inline explicit operator Eigen::half() const {
Eigen::half h;
h.x = x;
return h;
}
#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
(PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
PADDLE_HOSTDEVICE inline operator float16_t() const {
float16 h = *this;
return *reinterpret_cast<float16_t*>(&h);
#ifdef PADDLE_WITH_NATIVE_FP16
HOSTDEVICE inline explicit operator float16_t() const {
return *reinterpret_cast<const float16_t*>(this);
}
#endif
PADDLE_HOSTDEVICE inline explicit operator bool() const {
return (x & 0x7fff) != 0;
}
HOSTDEVICE inline explicit operator float() const {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
half tmp = *reinterpret_cast<const half*>(this);
return __half2float(tmp);
#elif defined(PADDLE_NEON)
float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
return vgetq_lane_f32(vcvt_f32_f16(res), 0);
PADDLE_HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(fp16_impl::half_to_float(*this));
#elif defined(__F16C__)
return _cvtsh_ss(this->x);
#else
// Conversion routine adapted from
// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
Bits v;
v.ui = this->x;
int32_t sign = v.si & sigC;
v.si ^= sign;
sign <<= shiftSign;
v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
Bits s;
s.si = mulC;
s.f *= v.si;
int32_t mask = -(norC > v.si);
v.si <<= shift;
v.si ^= (s.si ^ v.si) & mask;
v.si |= sign;
return v.f;
#endif
}
PADDLE_HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator float() const {
return fp16_impl::half_to_float(*this);
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(float(*this));
}
PADDLE_HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(fp16_impl::half_to_float(*this));
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(float(*this));
}
private:
union Bits {
float f;
int32_t si;
uint32_t ui;
};
static const int shift = 13;
static const int shiftSign = 16;
static const int32_t infN = 0x7F800000;
static const int32_t maxN = 0x477FE000; // max flt16 as flt32
static const int32_t minN = 0x38800000; // min flt16 normal as flt32
static const int32_t sigN = 0x80000000; // sign bit
static constexpr int32_t infC = infN >> shift;
static constexpr int32_t nanN = (infC + 1)
<< shift; // minimum flt16 nan as float32
static constexpr int32_t maxC = maxN >> shift;
static constexpr int32_t minC = minN >> shift;
static constexpr int32_t sigC = sigN >> shiftSign;
static const int32_t mulN = 0x52000000; // (1 << 23) / minN
static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift))
static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted
static const int32_t norC = 0x00400; // min flt32 normal downshifted
static constexpr int32_t maxD = infC - maxC - 1;
static constexpr int32_t minD = minC - subC - 1;
};
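The 2-byte size and alignment promised by the comment above the struct can be verified at compile time; a sketch, assuming a C++11 translation unit that includes this header:

// Sketch: layout checks for interop with CUDA half, float16_t, Eigen::half.
static_assert(sizeof(paddle::float16) == 2, "float16 must occupy 16 bits");
static_assert(alignof(paddle::float16) >= 2, "float16 must be 2-byte aligned");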
// Arithmetic operators
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
__device__ inline float16 operator+(const float16& a, const float16& b) {
// Arithmetic operators on GPU
// CUDA 9.0 provides built-in arithmetic operators for half, while
// CUDA 7.5 and 8.0 do not. The operators defined here let users
// write the same CUDA code for the half data type on CUDA 7.5 and
// 8.0 as on CUDA 9.0.
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
__CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
DEVICE inline half operator+(const half& a, const half& b) {
return __hadd(a, b);
}
DEVICE inline half operator-(const half& a, const half& b) {
return __hsub(a, b);
}
DEVICE inline half operator*(const half& a, const half& b) {
return __hmul(a, b);
}
DEVICE inline half operator/(const half& a, const half& b) {
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
}
DEVICE inline half operator-(const half& a) { return __hneg(a); }
DEVICE inline half& operator+=(half& a, const half& b) {
a = a + b;
return a;
}
DEVICE inline half& operator-=(half& a, const half& b) {
a = a - b;
return a;
}
DEVICE inline half& operator*=(half& a, const half& b) {
a = a * b;
return a;
}
DEVICE inline half& operator/=(half& a, const half& b) {
a = a / b;
return a;
}
DEVICE inline bool operator==(const half& a, const half& b) {
return __heq(a, b);
}
DEVICE inline bool operator!=(const half& a, const half& b) {
return __hne(a, b);
}
DEVICE inline bool operator<(const half& a, const half& b) {
return __hlt(a, b);
}
DEVICE inline bool operator<=(const half& a, const half& b) {
return __hle(a, b);
}
DEVICE inline bool operator>(const half& a, const half& b) {
return __hgt(a, b);
}
DEVICE inline bool operator>=(const half& a, const half& b) {
return __hge(a, b);
}
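With these definitions, device code written against the CUDA 9.0-style half operators compiles unchanged on CUDA 7.5/8.0. A hedged kernel sketch (the kernel name and shape are illustrative, not from the patch):

using namespace paddle;  // bring the half operators above into scope

// Sketch: elementwise half addition. On sm_53+ before CUDA 9.0 the `+`
// below resolves to the operator defined above (i.e. __hadd); on CUDA
// 9.0+ it resolves to the toolkit's built-in operator instead.
__global__ void half_add(const half* a, const half* b, half* c, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) c[i] = a[i] + b[i];
}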
/*
DEVICE inline float16 operator+(const float16& a, const float16& b) {
return float16(__hadd(half(a), half(b)));
}
__device__ inline float16 operator-(const float16& a, const float16& b) {
DEVICE inline float16 operator-(const float16& a, const float16& b) {
return float16(__hsub(half(a), half(b)));
}
__device__ inline float16 operator*(const float16& a, const float16& b) {
DEVICE inline float16 operator*(const float16& a, const float16& b) {
return float16(__hmul(half(a), half(b)));
}
__device__ inline float16 operator/(const float16& a, const float16& b) {
// TODO(kexinzhao): check the cuda version that starts to support __hdiv
DEVICE inline float16 operator/(const float16& a, const float16& b) {
float num = __half2float(half(a));
float denom = __half2float(half(b));
return float16(num / denom);
}
__device__ inline float16 operator-(const float16& a) {
DEVICE inline float16 operator-(const float16& a) {
return float16(__hneg(half(a)));
}
__device__ inline float16& operator+=(float16& a, const float16& b) {
DEVICE inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
__device__ inline float16& operator-=(float16& a, const float16& b) {
DEVICE inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
__device__ inline float16& operator*=(float16& a, const float16& b) {
DEVICE inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
__device__ inline float16& operator/=(float16& a, const float16& b) {
DEVICE inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
__device__ inline bool operator==(const float16& a, const float16& b) {
DEVICE inline bool operator==(const float16& a, const float16& b) {
return __heq(half(a), half(b));
}
__device__ inline bool operator!=(const float16& a, const float16& b) {
DEVICE inline bool operator!=(const float16& a, const float16& b) {
return __hne(half(a), half(b));
}
__device__ inline bool operator<(const float16& a, const float16& b) {
DEVICE inline bool operator<(const float16& a, const float16& b) {
return __hlt(half(a), half(b));
}
__device__ inline bool operator<=(const float16& a, const float16& b) {
DEVICE inline bool operator<=(const float16& a, const float16& b) {
return __hle(half(a), half(b));
}
__device__ inline bool operator>(const float16& a, const float16& b) {
DEVICE inline bool operator>(const float16& a, const float16& b) {
return __hgt(half(a), half(b));
}
__device__ inline bool operator>=(const float16& a, const float16& b) {
DEVICE inline bool operator>=(const float16& a, const float16& b) {
return __hge(half(a), half(b));
}
*/
// Arithmetic operators on ARMv8.2-A CPU
#elif defined(PADDLE_WITH_NATIVE_FP16)
HOST inline float16 operator+(const float16& a, const float16& b) {
float16 res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fadd h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0", "v1");
return res;
}
HOST inline float16 operator-(const float16& a, const float16& b) {
float16 res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fsub h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0", "v1");
return res;
}
HOST inline float16 operator*(const float16& a, const float16& b) {
float16 res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fmul h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0", "v1");
return res;
}
HOST inline float16 operator/(const float16& a, const float16& b) {
float16 res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fdiv h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0", "v1");
return res;
}
// On ARMv8.2-A CPU
#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
(PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39)
__host__ inline float16 operator+(const float16& a, const float16& b) {
HOST inline float16 operator-(const float16& a) {
float16 res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"fneg h0, h0\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[res_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0");
return res;
}
HOST inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
HOST inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
HOST inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
HOST inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
HOST inline bool operator==(const float16& a, const float16& b) {
uint16_t res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fcmeq h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&res)
: // clobbers
"memory", "v0", "v1");
return (res & 0xffff) != 0;
}
HOST inline bool operator!=(const float16& a, const float16& b) {
return !(a == b);
}
HOST inline bool operator<(const float16& a, const float16& b) {
uint16_t res;
asm volatile(
"ld1 {v1.h}[0], [%[a_ptr]]\n"
"ld1 {v0.h}[0], [%[b_ptr]]\n"
"fcmgt h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&res)
: // clobbers
"memory", "v0", "v1");
return (res & 0xffff) != 0;
}
HOST inline bool operator<=(const float16& a, const float16& b) {
uint16_t res;
asm volatile(
"ld1 {v1.h}[0], [%[a_ptr]]\n"
"ld1 {v0.h}[0], [%[b_ptr]]\n"
"fcmge h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&res)
: // clobbers
"memory", "v0", "v1");
return (res & 0xffff) != 0;
}
HOST inline bool operator>(const float16& a, const float16& b) {
uint16_t res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fcmgt h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&res)
: // clobbers
"memory", "v0", "v1");
return (res & 0xffff) != 0;
}
HOST inline bool operator>=(const float16& a, const float16& b) {
uint16_t res;
asm volatile(
"ld1 {v0.h}[0], [%[a_ptr]]\n"
"ld1 {v1.h}[0], [%[b_ptr]]\n"
"fcmge h0, h0, h1\n"
"st1 {v0.h}[0], [%[res_ptr]]\n"
: // outputs
: // inputs
[a_ptr] "r"(&(a.x)),
[b_ptr] "r"(&(b.x)),
[res_ptr] "r"(&res)
: // clobbers
"memory", "v0", "v1");
return (res & 0xffff) != 0;
}
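A note on the comparison stubs above: fcmeq/fcmgt/fcmge write an all-ones (0xffff) or all-zeros mask into the destination half register, which the trailing `(res & 0xffff) != 0` collapses to a bool, and operator< / operator<= simply swap the operand registers so the same greater-than instructions can be reused. A hedged host-side spot check (assuming an ARMv8.2-A build; the function name is illustrative):

inline void check_fp16_asm_ops() {
  float16 a(1.0f), b(2.0f);
  assert(static_cast<float>(a + b) == 3.0f);  // fadd path, exact in fp16
  assert(a < b);                              // fcmgt with swapped operands
  assert(!(a == b));                          // fcmeq produced an all-zeros mask
}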
/*
HOST inline float16 operator+(const float16& a, const float16& b) {
return float16(vaddh_f16(float16_t(a), float16_t(b)));
}
__host__ inline float16 operator-(const float16& a, const float16& b) {
HOST inline float16 operator-(const float16& a, const float16& b) {
return float16(vsubh_f16(float16_t(a), float16_t(b)));
}
__host__ inline float16 operator*(const float16& a, const float16& b) {
HOST inline float16 operator*(const float16& a, const float16& b) {
return float16(vmulh_f16(float16_t(a), float16_t(b)));
}
__host__ inline float16 operator/(const float16& a, const float16& b) {
HOST inline float16 operator/(const float16& a, const float16& b) {
return float16(vdivh_f16(float16_t(a), float16_t(b)));
}
__host__ inline float16 operator-(const float16& a) {
HOST inline float16 operator-(const float16& a) {
return float16(vnegh_f16(float16_t(a)));
}
__host__ inline float16& operator+=(float16& a, const float16& b) {
HOST inline float16& operator+=(float16& a, const float16& b) {
a = a + b;
return a;
}
__host__ inline float16& operator-=(float16& a, const float16& b) {
HOST inline float16& operator-=(float16& a, const float16& b) {
a = a - b;
return a;
}
__host__ inline float16& operator*=(float16& a, const float16& b) {
HOST inline float16& operator*=(float16& a, const float16& b) {
a = a * b;
return a;
}
__host__ inline float16& operator/=(float16& a, const float16& b) {
HOST inline float16& operator/=(float16& a, const float16& b) {
a = a / b;
return a;
}
__host__ inline bool operator==(const float16& a, const float16& b) {
HOST inline bool operator==(const float16& a, const float16& b) {
return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
}
__host__ inline bool operator!=(const float16& a, const float16& b) {
HOST inline bool operator!=(const float16& a, const float16& b) {
return !(a == b);
}
__host__ inline bool operator<(const float16& a, const float16& b) {
HOST inline bool operator<(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
#else
@@ -468,7 +738,7 @@ __host__ inline bool operator<(const float16& a, const float16& b) {
#endif // PADDLE_NEON_64
}
__host__ inline bool operator<=(const float16& a, const float16& b) {
HOST inline bool operator<=(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
#else
@@ -476,7 +746,7 @@ __host__ inline bool operator<=(const float16& a, const float16& b) {
#endif // PADDLE_NEON_64
}
__host__ inline bool operator>(const float16& a, const float16& b) {
HOST inline bool operator>(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
#else
@@ -484,231 +754,82 @@ __host__ inline bool operator>(const float16& a, const float16& b) {
#endif // PADDLE_NEON_64
}
__host__ inline bool operator>=(const float16& a, const float16& b) {
HOST inline bool operator>=(const float16& a, const float16& b) {
#ifdef PADDLE_NEON_64
return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
#else
return float(a) >= float(b);
#endif // PADDLE_NEON_64
}
*/
#else // Software emulation on other cpu
PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
// Arithmetic operators, software emulated on other CPU
#else
HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
return float16(float(a) + float(b));
}
PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
return float16(float(a) - float(b));
}
PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
return float16(float(a) * float(b));
}
PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
return float16(float(a) / float(b));
}
PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) {
HOSTDEVICE inline float16 operator-(const float16& a) {
float16 res;
res.x = a.x ^ 0x8000;
return res;
}
PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
a = float16(float(a) + float(b));
return a;
}
PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
a = float16(float(a) - float(b));
return a;
}
PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
a = float16(float(a) * float(b));
return a;
}
PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
a = float16(float(a) / float(b));
return a;
}
PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
return float(a) == float(b);
}
PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
return float(a) != float(b);
}
PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
return float(a) < float(b);
}
PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
return float(a) <= float(b);
}
PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
return float(a) > float(b);
}
PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
return float(a) >= float(b);
}
#endif
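On targets with no native fp16 support, every operator above round-trips through float except negation, which only flips the sign bit and therefore also negates infinities and NaNs correctly. A brief sketch of what the emulated path computes (illustrative, not part of the patch):

inline void check_emulated_ops() {
  float16 x(0.5f), y(0.25f);
  assert(static_cast<float>(x + y) == 0.75f);  // add via the float round trip
  assert((-x).x == (x.x ^ 0x8000));            // negation is a sign-bit flip
}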
namespace fp16_impl {
union Bits {
float f;
int32_t si;
uint32_t ui;
};
const int shift = 13;
const int shiftSign = 16;
const int32_t infN = 0x7F800000;
const int32_t maxN = 0x477FE000; // max flt16 as flt32
const int32_t minN = 0x38800000; // min flt16 normal as flt32
const int32_t sigN = 0x80000000; // sign bit
constexpr int32_t infC = infN >> shift;
constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32
constexpr int32_t maxC = maxN >> shift;
constexpr int32_t minC = minN >> shift;
constexpr int32_t sigC = sigN >> shiftSign;
const int32_t mulN = 0x52000000; // (1 << 23) / minN
const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift))
const int32_t subC = 0x003FF; // max flt32 subnormal downshifted
const int32_t norC = 0x00400; // min flt32 normal downshifted
constexpr int32_t maxD = infC - maxC - 1;
constexpr int32_t minD = minC - subC - 1;
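These constants all come from lining up the binary32 and binary16 layouts: `shift` is 23 - 10, the difference in mantissa widths; `maxN` (0x477FE000, i.e. 65520.0f) is the threshold at which a float stops rounding to a finite half; `minN` is 2^-14, the smallest normal half; and the `*C` values are the same thresholds after the 13-bit downshift into half scale. Two hedged spot checks of the derived values (not part of the patch):

static_assert(infC == 0x3FC00, "float +inf bit pattern, downshifted 13 bits");
static_assert(maxC == 0x23BFF, "overflow cutoff, downshifted into half scale");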
PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
half tmp = __float2half(f);
return *reinterpret_cast<float16*>(&tmp);
#elif defined(PADDLE_NEON_64)
float16 res;
asm volatile(
"ld1 {v0.s}[0], [%[float_ptr]]\n"
"fcvt h0, s0\n"
"st1 {v0.h}[0], [%[half_ptr]]\n"
: // outputs
: // inputs
[float_ptr] "r"(&f),
[half_ptr] "r"(&(res.x))
: // clobbers
"memory", "v0");
return res;
#elif defined(PADDLE_NEON_32)
float16 res;
asm volatile(
"vld1.32 {d0[0]}, [%[float_ptr]]\n"
"vcvt.f16.f32 d0, q0\n"
"vst1.16 {d0[0]}, [%[half_ptr]]\n"
: // outputs
: // inputs
[float_ptr] "r"(&f),
[half_ptr] "r"(&(res.x))
: // clobbers
"memory", "d0");
return res;
#elif defined(__F16C__)
float16 res;
res.x = _cvtss_sh(f, 0);
return res;
#else
// Conversion routine adapted from
// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
Bits v, s;
v.f = f;
uint32_t sign = v.si & sigN;
v.si ^= sign;
sign >>= shiftSign; // logical shift
s.si = mulN;
s.si = s.f * v.f; // correct subnormals
v.si ^= (s.si ^ v.si) & -(minN > v.si);
v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
v.ui >>= shift; // logical shift
v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
float16 res;
res.x = v.ui | sign;
return res;
#endif
}
PADDLE_HOSTDEVICE inline float half_to_float(float16 h) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
half tmp = *reinterpret_cast<half*>(&h);
return __half2float(tmp);
#elif defined(PADDLE_NEON_64)
float res;
asm volatile(
"ld1 {v0.h}[0], [%[half_ptr]]\n"
"fcvt s0, h0\n"
"st1 {v0.s}[0], [%[float_ptr]]\n"
: // outputs
: // inputs
[half_ptr] "r"(&(h.x)),
[float_ptr] "r"(&res)
: // clobbers
"memory", "v0");
return res;
#elif defined(PADDLE_NEON_32)
float res;
asm volatile(
"vld1.16 {d0[0]}, [%[half_ptr]]\n"
"vcvt.f32.f16 q0, d0\n"
"vst1.32 {d0[0]}, [%[float_ptr]]\n"
: // outputs
: // inputs
[half_ptr] "r"(&(h.x)),
[float_ptr] "r"(&res)
: // clobbers
"memory", "v0");
return res;
#elif defined(__F16C__)
return _cvtsh_ss(h.x);
#else
// Conversion routine adapted from
// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
Bits v;
v.ui = h.x;
int32_t sign = v.si & sigC;
v.si ^= sign;
sign <<= shiftSign;
v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
Bits s;
s.si = mulC;
s.f *= v.si;
int32_t mask = -(norC > v.si);
v.si <<= shift;
v.si ^= (s.si ^ v.si) & mask;
v.si |= sign;
return v.f;
#endif
}
} // namespace fp16_impl
} // namespace paddle
@@ -55,10 +55,12 @@ TEST(float16, conversion_cpu) {
EXPECT_EQ(float16(false).x, 0x0000);
// Implicit conversion to and from Eigen::half
/*
Eigen::half tmp = float16(1.0f);
float16 v_conv = tmp;
EXPECT_EQ(tmp.x, 0x3c00);
EXPECT_EQ(v_conv.x, 0x3c00);
*/
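Since the conversions are now explicit, the disabled block above has a cast-based equivalent; a hedged sketch of what such a check could look like:

// Sketch: explicit-conversion version of the disabled Eigen::half test.
Eigen::half tmp = static_cast<Eigen::half>(float16(1.0f));
float16 v_conv = static_cast<float16>(tmp);
EXPECT_EQ(tmp.x, 0x3c00);
EXPECT_EQ(v_conv.x, 0x3c00);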
// Default constructor
float16 v_def;
......