From a5feb771592d1bd7340ff7132518d6c52829b8e7 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Mon, 27 Nov 2017 17:12:21 -0800
Subject: [PATCH] address pr comment

---
 paddle/math/float16.h              | 839 +++++++++++++++++------------
 paddle/math/tests/test_float16.cpp |   2 +
 2 files changed, 482 insertions(+), 359 deletions(-)

diff --git a/paddle/math/float16.h b/paddle/math/float16.h
index 3b2217414..65c0489e1 100644
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -16,9 +16,14 @@ limitations under the License. */
 
 #include <stdint.h>
 
+#ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
+
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#include "paddle/platform/hostdevice.h"
+
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
 #else
@@ -31,25 +36,12 @@ limitations under the License. */
 #define PADDLE_CLANG_VER 0
 #endif  // __clang__
 
-#ifdef __CUDACC__
-#define PADDLE_HOSTDEVICE __host__ __device__
-#if CUDA_VERSION >= 7050
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
 #define PADDLE_CUDA_FP16
 #include <cuda_fp16.h>
-#endif  // CUDA_VERSION >= 7050
-#else
-#define PADDLE_HOSTDEVICE
-#endif  // __CUDACC__
-
-#ifdef __arm__
-#define PADDLE_ARM_32
 #endif
 
-#ifdef __aarch64__
-#define PADDLE_ARM_64
-#endif
-
-#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64)
+#if defined(__arm__) || defined(__aarch64__)
 #define PADDLE_ARM
 #endif
 
@@ -58,19 +50,12 @@ limitations under the License. */
 #include <arm_neon.h>
 #endif
 
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32)
-#define PADDLE_NEON_32
-#endif
-
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64)
-#define PADDLE_NEON_64
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
 #endif
 
-#ifdef PADDLE_ARM
-#ifdef __F16C__
-#undef __F16C__
-#endif  // __F16C__
-#else
+#ifndef PADDLE_ARM
 #include <immintrin.h>
 #endif  // PADDLE_ARM
 
@@ -78,27 +63,20 @@ limitations under the License. */
 
 namespace paddle {
 
-struct float16;
-
-namespace fp16_impl {
-// Convert from float to half precision in round-to-nearest-even mode
-PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f);
-PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
-}  // namespace fp16_impl
-
 // Use PADDLE_ALIGN(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
 // with CUDA half, ARM float16_t, and Eigen::half data types.
 struct PADDLE_ALIGN(2) float16 {
+public:
   uint16_t x;
 
-  PADDLE_HOSTDEVICE inline float16() : x(0) {}
+  HOSTDEVICE inline float16() : x(0) {}
 
-  PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
 
 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline float16(const half& h) {
+  HOSTDEVICE inline explicit float16(const half& h) {
 #if CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(&h)->x;
 #else
@@ -107,78 +85,64 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // PADDLE_CUDA_FP16
 
-  PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
+  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
 
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
-    (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
+#ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
   // float16_t is an alias for __fp16 in arm_fp16.h,
   // which is included in arm_neon.h.
-  PADDLE_HOSTDEVICE inline float16(const float16_t& h) {
-    float16_t tmp = h;
-    x = *reinterpret_cast<uint16_t*>(&tmp);
+  HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
   }
 #endif
 
-  PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
-
-  PADDLE_HOSTDEVICE inline explicit float16(int8_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
-
-  PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
-
-  PADDLE_HOSTDEVICE inline explicit float16(int16_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
-
-  PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
+  HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
 
-  PADDLE_HOSTDEVICE inline explicit float16(int32_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
+#elif defined(PADDLE_NEON)
+    float32x4_t tmp = vld1q_dup_f32(&val);
+    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+    x = *reinterpret_cast<uint16_t*>(&res);
 
-  PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
+#elif defined(__F16C__)
+    x = _cvtss_sh(val, 0);
 
-  PADDLE_HOSTDEVICE inline explicit float16(int64_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = mulN;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    x = v.ui | sign;
 
-  PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+#endif
   }
 
-  PADDLE_HOSTDEVICE inline explicit float16(float val) {
-    float16 res = fp16_impl::float_to_half_rn(val);
-    x = res.x;
-  }
+  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
 
+  template <class T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
 
-  PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) {
+  HOSTDEVICE inline float16& operator=(const float16& rhs) {
     x = rhs.x;
     return *this;
   }
 
 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) {
+  HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(&rhs)->x;
 #else
@@ -188,87 +152,75 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif
 
-  PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
     x = rhs.x;
     return *this;
   }
 
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
-    (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
-  PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
-    float16_t tmp = rhs;
-    x = *reinterpret_cast<uint16_t*>(&tmp);
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
     return *this;
   }
 #endif
 
-  PADDLE_HOSTDEVICE inline float16& operator=(bool b) {
+  HOSTDEVICE inline float16& operator=(bool b) {
     x = b ? 0x3c00 : 0;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(int8_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(uint8_t val) {
+    x = float16(val).x;
    return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(int16_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(uint16_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(int32_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(uint32_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(int64_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(uint64_t val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(float val) {
-    float16 res = fp16_impl::float_to_half_rn(val);
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(float val) {
+    x = float16(val).x;
     return *this;
   }
 
-  PADDLE_HOSTDEVICE inline float16& operator=(double val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
+  HOSTDEVICE inline float16& operator=(double val) {
+    x = float16(val).x;
     return *this;
   }
 
 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline operator half() const {
+  HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
     __half_raw h;
     h.x = x;
@@ -281,186 +233,504 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // PADDLE_CUDA_FP16
 
-  PADDLE_HOSTDEVICE inline operator Eigen::half() const {
+  HOSTDEVICE inline explicit operator Eigen::half() const {
     Eigen::half h;
     h.x = x;
     return h;
   }
 
-#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
-    (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34)
-  PADDLE_HOSTDEVICE inline operator float16_t() const {
-    float16 h = *this;
-    return *reinterpret_cast<float16_t*>(&h);
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
   }
 #endif
 
-  PADDLE_HOSTDEVICE inline explicit operator bool() const {
-    return (x & 0x7fff) != 0;
-  }
+  HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+    return __half2float(tmp);
+
+#elif defined(PADDLE_NEON)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
 
-  PADDLE_HOSTDEVICE inline explicit operator int8_t() const {
-    return static_cast<int8_t>(fp16_impl::half_to_float(*this));
+#elif defined(__F16C__)
+    return _cvtsh_ss(this->x);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v;
+    v.ui = this->x;
+    int32_t sign = v.si & sigC;
+    v.si ^= sign;
+    sign <<= shiftSign;
+    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    Bits s;
+    s.si = mulC;
+    s.f *= v.si;
+    int32_t mask = -(norC > v.si);
+    v.si <<= shift;
+    v.si ^= (s.si ^ v.si) & mask;
+    v.si |= sign;
+    return v.f;
+
+#endif
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator uint8_t() const {
-    return static_cast<uint8_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator int16_t() const {
-    return static_cast<int16_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator uint16_t() const {
-    return static_cast<uint16_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
  }
 
-  PADDLE_HOSTDEVICE inline explicit operator int32_t() const {
-    return static_cast<int32_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator uint32_t() const {
-    return static_cast<uint32_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator int64_t() const {
-    return static_cast<int64_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator uint64_t() const {
-    return static_cast<uint64_t>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator float() const {
-    return fp16_impl::half_to_float(*this);
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
   }
 
-  PADDLE_HOSTDEVICE inline explicit operator double() const {
-    return static_cast<double>(fp16_impl::half_to_float(*this));
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
   }
+
+private:
+  union Bits {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  };
+
+  static const int shift = 13;
+  static const int shiftSign = 16;
+
+  static const int32_t infN = 0x7F800000;
+  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
+  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
+  static const int32_t sigN = 0x80000000;  // sign bit
+
+  static constexpr int32_t infC = infN >> shift;
+  static constexpr int32_t nanN = (infC + 1)
+                                  << shift;  // minimum flt16 nan as float32
+  static constexpr int32_t maxC = maxN >> shift;
+  static constexpr int32_t minC = minN >> shift;
+  static constexpr int32_t sigC = sigN >> shiftSign;
+
+  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
+  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
+  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
+  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
+
+  static constexpr int32_t maxD = infC - maxC - 1;
+  static constexpr int32_t minD = minC - subC - 1;
 };
 
-// Arithmetic operators
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-__device__ inline float16 operator+(const float16& a, const float16& b) {
+// Arithmetic operators on GPU
+// CUDA 9.0 provides built-in arithmetic operators for half, while
+// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here let
+// users write CUDA code involving the half data type that compiles
+// the same way under CUDA 7.5 and 8.0 as it does under CUDA 9.0.
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { + return __hadd(a, b); +} + +DEVICE inline half operator-(const half& a, const half& b) { + return __hsub(a, b); +} + +DEVICE inline half operator*(const half& a, const half& b) { + return __hmul(a, b); +} + +DEVICE inline half operator/(const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} + +DEVICE inline half operator-(const half& a) { return __hneg(a); } + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { + return __heq(a, b); +} + +DEVICE inline bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} + +DEVICE inline bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} + +DEVICE inline bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} + +DEVICE inline bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} + +DEVICE inline bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +/* +DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(half(a), half(b))); } -__device__ inline float16 operator-(const float16& a, const float16& b) { +DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(half(a), half(b))); } -__device__ inline float16 operator*(const float16& a, const float16& b) { +DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(half(a), half(b))); } -__device__ inline float16 operator/(const float16& a, const float16& b) { - // TODO(kexinzhao): check the cuda version that starts to support __hdiv +DEVICE inline float16 operator/(const float16& a, const float16& b) { float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); } -__device__ inline float16 operator-(const float16& a) { +DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(half(a))); } -__device__ inline float16& operator+=(float16& a, const float16& b) { +DEVICE inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__device__ inline float16& operator-=(float16& a, const float16& b) { +DEVICE inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__device__ inline float16& operator*=(float16& a, const float16& b) { +DEVICE inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__device__ inline float16& operator/=(float16& a, const float16& b) { +DEVICE inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__device__ inline bool operator==(const float16& a, const float16& b) { +DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(half(a), half(b)); } -__device__ inline bool operator!=(const float16& a, const float16& b) { +DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(half(a), half(b)); } -__device__ inline bool operator<(const float16& a, const float16& b) { +DEVICE inline 
bool operator<(const float16& a, const float16& b) { return __hlt(half(a), half(b)); } -__device__ inline bool operator<=(const float16& a, const float16& b) { +DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(half(a), half(b)); } -__device__ inline bool operator>(const float16& a, const float16& b) { +DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(half(a), half(b)); } -__device__ inline bool operator>=(const float16& a, const float16& b) { +DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(half(a), half(b)); } +*/ + +// Arithmetic operators on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} -// On ARMv8.2-A CPU -#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) -__host__ inline float16 operator+(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const 
float16& b) {
+  return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+/*
+HOST inline float16 operator+(const float16& a, const float16& b) {
+  return float16(vaddh_f16(float16_t(a), float16_t(b)));
+}
+
+HOST inline float16 operator-(const float16& a, const float16& b) {
+  return float16(vsubh_f16(float16_t(a), float16_t(b)));
+}
+
+HOST inline float16 operator*(const float16& a, const float16& b) {
+  return float16(vmulh_f16(float16_t(a), float16_t(b)));
+}
+
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  return float16(vdivh_f16(float16_t(a), float16_t(b)));
+}
+
+HOST inline float16 operator-(const float16& a) {
+  return float16(vnegh_f16(float16_t(a)));
+}
+
+HOST inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOST inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOST inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOST inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOST inline bool operator==(const float16& a, const float16& b) {
+  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
+}
+
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
 #ifdef PADDLE_NEON_64
   return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
 #else
@@ -468,7 +738,7 @@ __host__ inline bool operator<(const float16& a, const float16& b) {
 #endif  // PADDLE_NEON_64
 }
 
-__host__ inline bool operator<=(const float16& a, const float16& b) {
+HOST inline bool operator<=(const float16& a, const float16& b) {
 #ifdef PADDLE_NEON_64
   return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
 #else
@@ -476,7 +746,7 @@ __host__ inline bool operator<=(const float16& a, const float16& b) {
 #endif  // PADDLE_NEON_64
 }
 
-__host__ inline bool operator>(const float16& a, const float16& b) {
+HOST inline bool operator>(const float16& a, const float16& b) {
 #ifdef PADDLE_NEON_64
   return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
 #else
@@ -484,231 +754,82 @@ __host__ inline bool operator>(const float16& a, const float16& b) {
 #endif  // PADDLE_NEON_64
 }
 
-__host__ inline bool operator>=(const float16& a, const float16& b) {
+HOST inline bool operator>=(const float16& a, const float16& b) {
 #ifdef PADDLE_NEON_64
   return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
 #else
   return float(a) >= float(b);
 #endif  // PADDLE_NEON_64
 }
+*/
 
-#else  // Software emulation on other cpu
-PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+// Arithmetic operators, software emulated on other CPU
+#else
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
   return float16(float(a) + float(b));
 }
 
-PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
   return float16(float(a) - float(b));
 }
 
-PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
   return float16(float(a) * float(b));
 }
 
-PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
   return float16(float(a) / float(b));
 }
 
-PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) {
+HOSTDEVICE inline float16 operator-(const float16& a) {
   float16 res;
   res.x = a.x ^ 0x8000;
   return res;
 }
 
-PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
   a = float16(float(a) + float(b));
   return a;
 }
 
-PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
   a = float16(float(a) - float(b));
   return a;
 }
 
-PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
   a = float16(float(a) * float(b));
   return a;
 }
 
-PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
   a = float16(float(a) / float(b));
   return a;
 }
 
-PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
   return float(a) == float(b);
 }
 
-PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
   return float(a) != float(b);
 }
 
-PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
   return float(a) < float(b);
 }
 
-PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
   return float(a) <= float(b);
 }
 
-PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
   return float(a) > float(b);
 }
 
-PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
   return float(a) >= float(b);
 }
 #endif
-
-namespace fp16_impl {
-
-union Bits {
-  float f;
-  int32_t si;
-  uint32_t ui;
-};
-
-const int shift = 13;
-const int shiftSign = 16;
-
-const int32_t infN = 0x7F800000;
-const int32_t maxN = 0x477FE000;  // max flt16 as flt32
-const int32_t minN = 0x38800000;  // min flt16 normal as flt32
-const int32_t sigN = 0x80000000;  // sign bit
-
-constexpr int32_t infC = infN >> shift;
-constexpr int32_t nanN = (infC + 1) << shift;  // minimum flt16 nan as float32
-constexpr int32_t maxC = maxN >> shift;
-constexpr int32_t minC = minN >> shift;
-constexpr int32_t sigC = sigN >> shiftSign;
-
-const int32_t mulN = 0x52000000;  // (1 << 23) / minN
-const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
-const int32_t subC = 0x003FF;  // max flt32 subnormal downshifted
-const int32_t norC = 0x00400;  // min flt32 normal downshifted
-
-constexpr int32_t maxD = infC - maxC - 1;
-constexpr int32_t minD = minC - subC - 1;
-
-PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-  half tmp = __float2half(f);
-  return *reinterpret_cast<float16*>(&tmp);
-
-#elif defined(PADDLE_NEON_64)
-  float16 res;
-  asm volatile(
-      "ld1 {v0.s}[0], [%[float_ptr]]\n"
-      "fcvt h0, s0\n"
-      "st1 {v0.h}[0], [%[half_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [float_ptr] "r"(&f),
-      [half_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "v0");
-  return res;
-
-#elif defined(PADDLE_NEON_32)
-  float16 res;
-  asm volatile(
-      "vld1.32 {d0[0]}, [%[float_ptr]]\n"
-      "vcvt.f16.f32 d0, q0\n"
-      "vst1.16 {d0[0]}, [%[half_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [float_ptr] "r"(&f),
-      [half_ptr] "r"(&(res.x))
-      :  // clobbers
-      "memory", "d0");
-  return res;
-
-#elif defined(__F16C__)
-  float16 res;
-  res.x = _cvtss_sh(f, 0);
-  return res;
-
-#else
-  // Conversion routine adapted from
-  // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-  Bits v, s;
-  v.f = f;
-  uint32_t sign = v.si & sigN;
-  v.si ^= sign;
-  sign >>= shiftSign;  // logical shift
-  s.si = mulN;
-  s.si = s.f * v.f;  // correct subnormals
-  v.si ^= (s.si ^ v.si) & -(minN > v.si);
-  v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
-  v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
-  v.ui >>= shift;  // logical shift
-  v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
-  v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
-  float16 res;
-  res.x = v.ui | sign;
-  return res;
-
-#endif
-}
-
-PADDLE_HOSTDEVICE inline float half_to_float(float16 h) {
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
-  half tmp = *reinterpret_cast<half*>(&h);
-  return __half2float(tmp);
-
-#elif defined(PADDLE_NEON_64)
-  float res;
-  asm volatile(
-      "ld1 {v0.h}[0], [%[half_ptr]]\n"
-      "fcvt s0, h0\n"
-      "st1 {v0.s}[0], [%[float_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [half_ptr] "r"(&(h.x)),
-      [float_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0");
-  return res;
-
-#elif defined(PADDLE_NEON_32)
-  float res;
-  asm volatile(
-      "vld1.16 {d0[0]}, [%[half_ptr]]\n"
-      "vcvt.f32.f16 q0, d0\n"
-      "vst1.32 {d0[0]}, [%[float_ptr]]\n"
-      :  // outputs
-      :  // inputs
-      [half_ptr] "r"(&(h.x)),
-      [float_ptr] "r"(&res)
-      :  // clobbers
-      "memory", "v0");
-  return res;
-
-#elif defined(__F16C__)
-  return _cvtsh_ss(h.x);
-
-#else
-  // Conversion routine adapted from
-  // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
-  Bits v;
-  v.ui = h.x;
-  int32_t sign = v.si & sigC;
-  v.si ^= sign;
-  sign <<= shiftSign;
-  v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
-  v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
-  Bits s;
-  s.si = mulC;
-  s.f *= v.si;
-  int32_t mask = -(norC > v.si);
-  v.si <<= shift;
-  v.si ^= (s.si ^ v.si) & mask;
-  v.si |= sign;
-  return v.f;
-
-#endif
-}
-
-}  // namespace fp16_impl
 }  // namespace paddle
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
index 8c74bcc03..f5541d8f0 100644
--- a/paddle/math/tests/test_float16.cpp
+++ b/paddle/math/tests/test_float16.cpp
@@ -55,10 +55,12 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(false).x, 0x0000);
 
   // Implicit conversion to and from Eigen::half
+  /*
   Eigen::half tmp = float16(1.0f);
   float16 v_conv = tmp;
   EXPECT_EQ(tmp.x, 0x3c00);
   EXPECT_EQ(v_conv.x, 0x3c00);
+  */
 
   // Default constructor
   float16 v_def;
-- 
GitLab
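
Editorial illustration, not part of the patch: the "Arithmetic operators on GPU" comment in the diff above means that, with the half overloads guarded by CUDA_VERSION < 9000, kernel code written against CUDA 9.0's built-in half operators also compiles under CUDA 7.5/8.0 on devices of compute capability 5.3 or higher. A minimal sketch; the kernel name and setup are hypothetical:

    // haxpy.cu -- hypothetical kernel; assumes paddle/math/float16.h is
    // included and the file is compiled with nvcc for sm_53 or newer.
    __global__ void haxpy(int n, half alpha, const half* x, half* y) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        // operator* and operator+ resolve to the __hmul/__hadd wrappers on
        // CUDA < 9.0 and to the built-in half operators on CUDA >= 9.0.
        y[i] = alpha * x[i] + y[i];
      }
    }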
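
Likewise, a host-side sketch of the conversion and software-emulated arithmetic paths introduced by this patch, assuming a CPU without native fp16 support (1.5 and 3.5 are exactly representable in fp16, and 3.5 encodes as 0x4300):

    #include <cstdio>
    #include "paddle/math/float16.h"

    int main() {
      using paddle::float16;
      float16 a(1.5f);                // float -> half via the explicit constructor
      float16 b = a + float16(2.0f);  // falls back to float16(float(a) + float(b))
      // Conversions are explicit after this patch, hence the static_cast.
      printf("%.1f (bits 0x%04x)\n", static_cast<float>(b), b.x);  // 3.5 (bits 0x4300)
      return 0;
    }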