Commit e877cdb8
Authored Nov 13, 2017 by Kexin Zhao

add float16 arithmetic on arm cpu

Parent: 9d8b3059

Showing 1 changed file with 389 additions and 90 deletions

paddle/math/float16.h  (+389, -90)
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+// need to define PADDLE_ARM_FP16
+
 #pragma once

 #include <cstdint>
@@ -24,6 +26,18 @@ limitations under the License. */
 #include "Eigen/src/Core/arch/CUDA/Half.h"
 #endif

+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif  // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif  // __clang__
+
 #ifdef __CUDACC__
 #define PADDLE_HOSTDEVICE __host__ __device__
 #if CUDA_VERSION >= 7050
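The two new macros pack the compiler's major.minor version into a two-digit number, so gcc 7.1 encodes to PADDLE_GNUC_VER == 71 and clang 3.9 to PADDLE_CLANG_VER == 39 -- exactly the thresholds the arithmetic section below checks before enabling ARMv8.2-A fp16 intrinsics. A minimal standalone sketch (illustrative, not part of the commit) that prints the encoded value:

#include <cstdio>

#ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
#else
#define PADDLE_GNUC_VER 0
#endif

int main() {
  // gcc 7.1 prints 71; note clang also defines __GNUC__ (gcc-4.2 compatible),
  // which is why a separate PADDLE_CLANG_VER check exists in the header
  std::printf("PADDLE_GNUC_VER = %d\n", PADDLE_GNUC_VER);
  return 0;
}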
@@ -48,6 +62,7 @@ limitations under the License. */
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)
 #define PADDLE_NEON
+#include <arm_neon.h>
 #endif

 #if defined(PADDLE_NEON) && defined(PADDLE_ARM_32)
@@ -58,26 +73,16 @@ limitations under the License. */
 #define PADDLE_NEON_64
 #endif

-#if defined(PADDLE_ARM) && defined(PADDLE_NEON)
-#include <arm_neon.h>
-#endif
-
-#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM)
-#include <immintrin.h>
-#else
 #ifdef PADDLE_ARM
 #ifdef __F16C__
 #undef __F16C__
-#endif
-#endif
-#endif  // __F16C__
+#endif  // __F16C__
+#else
+#include <immintrin.h>
+#endif  // PADDLE_ARM

 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))

-// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h
-template <typename To, typename From>
-To convert(From f) {
-  return static_cast<To>(f);
-}
-
 namespace paddle {

 struct float16;
@@ -86,13 +91,12 @@ namespace fp16_impl {
 // convert from float to half precision in round-to-nearest-even mode
 PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f);
 PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
-PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x);
 }  // namespace fp16_impl

 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
-// with CUDA half and Eigen::half data types.
+// with CUDA half, ARM float16_t, and Eigen::half data types.
 struct PADDLE_ALIGN(2) float16 {
   uint16_t x;
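That comment is the contract that makes the reinterpret_casts throughout this file workable in practice: a float16 is exactly two bytes with two-byte alignment, matching CUDA half, ARM float16_t, and Eigen::half. A minimal sketch of the guarantee, using a stand-in struct rather than the commit's own type:

#include <cstdint>

struct __attribute__((aligned(2))) half_bits {  // stand-in for paddle::float16
  uint16_t x;
};

static_assert(sizeof(half_bits) == 2, "exactly one value per two bytes");
static_assert(alignof(half_bits) == 2, "aligned like a uint16_t");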
@@ -103,7 +107,7 @@ struct PADDLE_ALIGN(2) float16 {
   PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}

 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline float16(const half h) {
+  PADDLE_HOSTDEVICE inline float16(const half& h) {
 #if CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(&h)->x;
 #else
@@ -111,40 +115,72 @@ struct PADDLE_ALIGN(2) float16 {
 #endif  // CUDA_VERSION >= 9000
   }
 #endif  // PADDLE_CUDA_FP16

-  /*
-#ifdef PADDLE_CUDA_FP16
-#if CUDA_VERSION < 9000
-  PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {}
-#else
-  PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {}
-  PADDLE_HOSTDEVICE inline float16(const half& h)
-      : x(*reinterpret_cast<uint16_t*>(&h)) {}
-#endif  // CUDA_VERSION < 9000
-#endif  // PADDLE_CUDA_FP16
-  */

 #ifdef USE_EIGEN
   PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
 #endif  // USE_EIGEN

-#if defined(PADDLE_ARM) && defined(PADDLE_NEON)
+#ifdef PADDLE_NEON
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h
-  // which is included in arm_neon.h
-  PADDLE_HOSTDEVICE inline float16(const float16_t h) {
-    x = *reinterpret_cast<uint16_t*>(&h);
+  // float16_t is an alias for __fp16 in arm_fp16.h,
+  // which is included in arm_neon.h.
+  // According to gcc, __fp16 can only be used as an argument to fp16
+  // intrinsic defined in arm_neon.h or as a storage type. It cannot
+  // be used as a formal function argument.
+  // TODO(kexinzhao): test it on RPI
+  PADDLE_HOSTDEVICE inline float16(const float16_t* h) {
+    x = *reinterpret_cast<uint16_t*>(h);
   }
 #endif

+  PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  PADDLE_HOSTDEVICE inline explicit float16(int8_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(int16_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(int32_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(int64_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+  }
+
+  PADDLE_HOSTDEVICE inline explicit float16(float val) {
+    float16 res = fp16_impl::float_to_half_rn(val);
+    x = res.x;
+  }
+
-  template <class T>
-  PADDLE_HOSTDEVICE inline explicit float16(const T& val) {
+  PADDLE_HOSTDEVICE inline explicit float16(double val) {
     float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
     x = res.x;
   }
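The effect of replacing the single template constructor with this explicit per-type set is that every narrowing into float16 is spelled out at the call site, and types without a sensible float conversion no longer match. A usage sketch (illustrative, not from the commit):

#include "paddle/math/float16.h"

void ctor_examples() {
  paddle::float16 a(3);      // int32_t overload, converts via float_to_half_rn
  paddle::float16 b(2.5);    // double overload, rounds to nearest even
  paddle::float16 c(true);   // 0x3c00, the half bit pattern of 1.0
  // paddle::float16 d = 3;  // ill-formed: the constructors are explicit
  (void)a; (void)b; (void)c;
}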
@@ -155,7 +191,7 @@ struct PADDLE_ALIGN(2) float16 {
   }

 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) {
+  PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
     x = reinterpret_cast<__half_raw*>(&rhs)->x;
 #else
@@ -172,27 +208,80 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // USE_EIGEN

-#if defined(PADDLE_ARM) && defined(PADDLE_NEON)
-  PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) {
-    x = *reinterpret_cast<uint16_t*>(&rhs);
+#ifdef PADDLE_NEON
+  PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) {
+    x = *reinterpret_cast<uint16_t*>(rhs);
     return *this;
   }
 #endif

-  /*
-  PADDLE_HOSTDEVICE inline explicit float16(int val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
-
-  PADDLE_HOSTDEVICE inline explicit float16(double val) {
-    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
-    x = res.x;
-  }
-  */
+  PADDLE_HOSTDEVICE inline float16& operator=(bool b) {
+    x = b ? 0x3c00 : 0;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(float val) {
+    float16 res = fp16_impl::float_to_half_rn(val);
+    x = res.x;
+    return *this;
+  }
+
+  PADDLE_HOSTDEVICE inline float16& operator=(double val) {
+    float16 res = fp16_impl::float_to_half_rn(static_cast<float>(val));
+    x = res.x;
+    return *this;
+  }

 #ifdef PADDLE_CUDA_FP16
-  PADDLE_HOSTDEVICE inline operator half() {
+  PADDLE_HOSTDEVICE inline operator half() const {
 #if CUDA_VERSION >= 9000
     __half_raw h;
     h.x = x;
@@ -206,82 +295,270 @@ struct PADDLE_ALIGN(2) float16 {
 #endif  // PADDLE_CUDA_FP16

 #ifdef USE_EIGEN
-  PADDLE_HOSTDEVICE inline operator Eigen::half() {
+  PADDLE_HOSTDEVICE inline operator Eigen::half() const {
     Eigen::half h;
     h.x = x;
     return h;
   }
 #endif  // USE_EIGEN

-#if defined(PADDLE_ARM) && defined(PADDLE_NEON)
-  PADDLE_HOSTDEVICE inline operator float16_t() {
+#ifdef PADDLE_NEON
+  // check whether it works or not
+  PADDLE_HOSTDEVICE inline operator float16_t() const {
     float16 h = *this;
     return *reinterpret_cast<float16_t*>(&h);
   }
 #endif

-  PADDLE_HOSTDEVICE inline explicit operator bool() {
+  PADDLE_HOSTDEVICE inline explicit operator bool() const {
     return (x & 0x7fff) != 0;
   }

-  PADDLE_HOSTDEVICE inline explicit operator int8_t() {
-    return static_cat<int8_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator uint8_t() {
-    return static_cat<uint8_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator int16_t() {
-    return static_cat<int16_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator uint16_t() {
-    return static_cat<uint16_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator int32_t() {
-    return static_cat<int32_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator uint32_t() {
-    return static_cat<uint32_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator int64_t() {
-    return static_cat<int64_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator uint64_t() {
-    return static_cat<uint64_t>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(fp16_impl::half_to_float(*this));
   }

-  PADDLE_HOSTDEVICE inline explicit operator float() {
+  PADDLE_HOSTDEVICE inline explicit operator float() const {
     return fp16_impl::half_to_float(*this);
   }

-  PADDLE_HOSTDEVICE inline explicit operator double() {
-    return static_cat<double>(fp16_impl::half_to_float(*this));
+  PADDLE_HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(fp16_impl::half_to_float(*this));
   }
 };
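Making every conversion operator const is what allows a const float16 (for example, an element of a const tensor) to be read back as a built-in type, and the diff also fixes the static_cat typos in passing. Note that operator bool tests (x & 0x7fff) != 0, so both +0.0 and -0.0 convert to false. A usage sketch (illustrative, not from the commit):

#include "paddle/math/float16.h"

void conv_examples(const paddle::float16& v) {
  float f = static_cast<float>(v);      // via fp16_impl::half_to_float
  int32_t i = static_cast<int32_t>(v);  // truncates through float
  bool nonzero = static_cast<bool>(v);  // false only for +0.0 and -0.0
  (void)f; (void)i; (void)nonzero;
}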
 // arithmetic operators
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 __device__ inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(a, b));
+  return float16(__hadd(half(a), half(b)));
 }

 __device__ inline float16 operator-(const float16& a, const float16& b) {
-  return __hsub(a, b);
+  return float16(__hsub(half(a), half(b)));
 }

 __device__ inline float16 operator*(const float16& a, const float16& b) {
-  return __hmul(a, b);
+  return float16(__hmul(half(a), half(b)));
 }

-#elif  // on arm cpu
+__device__ inline float16 operator/(const float16& a, const float16& b) {
+  // TODO(kexinzhao): check the cuda version that starts to support __hdiv
+  // intrinsic
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+}

-#else
+__device__ inline float16 operator-(const float16& a) {
+  return float16(__hneg(half(a)));
+}
+
+__device__ inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+__device__ inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+__device__ inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+__device__ inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+__device__ inline bool operator==(const float16& a, const float16& b) {
+  return __heq(half(a), half(b));
+}
+
+__device__ inline bool operator!=(const float16& a, const float16& b) {
+  return __hne(half(a), half(b));
+}
+
+__device__ inline bool operator<(const float16& a, const float16& b) {
+  return __hlt(half(a), half(b));
+}
+
+__device__ inline bool operator<=(const float16& a, const float16& b) {
+  return __hle(half(a), half(b));
+}
+
+__device__ inline bool operator>(const float16& a, const float16& b) {
+  return __hgt(half(a), half(b));
+}
+
+__device__ inline bool operator>=(const float16& a, const float16& b) {
+  return __hge(half(a), half(b));
+}
+// On ARMv8.2-A CPU
+#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \
+    defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16)
+__host__ inline float16 operator+(const float16& a, const float16& b) {
+  return float16(vaddh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline float16 operator-(const float16& a, const float16& b) {
+  return float16(vsubh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline float16 operator*(const float16& a, const float16& b) {
+  return float16(vmulh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline float16 operator/(const float16& a, const float16& b) {
+  return float16(vdivh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline float16 operator-(const float16& a) {
+  return float16(vnegh_f16(float16_t(a)));
+}
+
+__host__ inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+__host__ inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+__host__ inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+__host__ inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+__host__ inline bool operator==(const float16& a, const float16& b) {
+  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline bool operator!=(const float16& a, const float16& b) {
+  return !(a == b);
+}
+
+// compare only available in NEON_64
+__host__ inline bool operator<(const float16& a, const float16& b) {
+  return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline bool operator<=(const float16& a, const float16& b) {
+  return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline bool operator>(const float16& a, const float16& b) {
+  return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
+}
+
+__host__ inline bool operator>=(const float16& a, const float16& b) {
+  return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
+}
+#else  // software emulation on other cpu
+PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+  return float16(float(a) + float(b));
+}
+
+PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+  return float16(float(a) - float(b));
+}
+
+PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+  return float16(float(a) * float(b));
+}
+
+PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(float(a) / float(b));
+}
+
+PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) {
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+}
+
+PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = float16(float(a) + float(b));
+  return a;
+}
+
+PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = float16(float(a) - float(b));
+  return a;
+}
+
+PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = float16(float(a) * float(b));
+  return a;
+}
+
+PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = float16(float(a) / float(b));
+  return a;
+}
+
+PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+  return float(a) == float(b);
+}
+
+PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return float(a) != float(b);
+}
+
+PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+  return float(a) < float(b);
+}
+
+PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return float(a) <= float(b);
+}
+
+PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+  return float(a) > float(b);
+}
+
+PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return float(a) >= float(b);
+}
 #endif
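Together, the three branches above give the same operator surface on every target: __hadd and friends in CUDA device code on compute capability 5.3 and up, vaddh_f16 and friends on an ARMv8.2-A host with native fp16 (gcc >= 7.1 or clang >= 3.9), and a float round-trip everywhere else. Host-side generic code can therefore be written once; a sketch (illustrative, not from the commit):

#include "paddle/math/float16.h"

// Host-side example; device code would additionally need __device__.
// The #if chain above picks the implementation at compile time.
paddle::float16 axpy(paddle::float16 a, paddle::float16 x, paddle::float16 y) {
  return a * x + y;
}

bool almost_equal(paddle::float16 a, paddle::float16 b, float tol = 1e-3f) {
  float d = static_cast<float>(a) - static_cast<float>(b);
  return (d < 0 ? -d : d) <= tol;
}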
@@ -320,16 +597,11 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
   half tmp = __float2half(f);
   return *reinterpret_cast<float16*>(&(tmp));

-#elif defined(__F16C__)
-  float16 res;
-  res.x = _cvtss_sh(f, 0);
-  return res;
-
-#elif defined(PADDLE_ARM_64)  // test on RPI
+#elif defined(PADDLE_NEON_64)  // test on RPI
   float16 res;
   asm volatile(
       "ld1 {v0.s}[0], [%[float_ptr]] \n"
-      "FCVT h0, s0 \n"
+      "fcvt h0, s0 \n"
       "st1 {v0.h}[0], [%[half_ptr]] \n"
       :  // outputs
       :  // inputs
@@ -339,6 +611,25 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
       "memory", "v0");
   return res;

+#elif defined(PADDLE_NEON_32)  // test on RPI
+  float16 res;
+  asm volatile(
+      "vld1.32 {d0[0]}, [%[float_ptr]] \n"
+      "vcvt.f16.f32 d0, q0 \n"
+      "vst1.16 {d0[0]}, [%[half_ptr]] \n"
+      :  // outputs
+      :  // inputs
+      [float_ptr] "r"(&f),
+      [half_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "d0");
+  return res;
+
+#elif defined(__F16C__)
+  float16 res;
+  res.x = _cvtss_sh(f, 0);
+  return res;
+
 #else
 // Conversion routine adapted from
 // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
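For targets with neither fp16 hardware nor F16C, the routine under the #else does the conversion in integer arithmetic. The core of such a routine, handling the normal-number path with round-to-nearest-even, looks roughly like the self-contained sketch below (editor's illustration, not the commit's code; subnormal inputs are flushed to signed zero here for brevity):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint16_t float_to_half_rn_sketch(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));            // type-pun safely
  uint32_t sign = (bits >> 16) & 0x8000u;          // sign moves to bit 15
  int32_t exp = ((bits >> 23) & 0xff) - 127 + 15;  // rebias 8-bit -> 5-bit exponent
  uint32_t mant = bits & 0x007fffffu;
  if (exp <= 0) return static_cast<uint16_t>(sign);             // flush to signed zero
  if (exp >= 31) return static_cast<uint16_t>(sign | 0x7c00u);  // overflow to infinity
  uint32_t half_mant = mant >> 13;  // keep top 10 mantissa bits
  uint32_t rem = mant & 0x1fffu;    // 13 discarded bits decide rounding
  if (rem > 0x1000u || (rem == 0x1000u && (half_mant & 1u))) {  // ties to even
    if (++half_mant == 0x400u) { half_mant = 0; exp++; }        // mantissa carry
  }
  if (exp >= 31) return static_cast<uint16_t>(sign | 0x7c00u);
  return static_cast<uint16_t>(sign | (uint32_t(exp) << 10) | half_mant);
}

int main() {
  std::printf("1.0f -> 0x%04x (expect 0x3c00)\n", float_to_half_rn_sketch(1.0f));
  return 0;
}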
@@ -367,10 +658,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) {
   half tmp = *reinterpret_cast<half*>(&h);
   return __half2float(h);

-#elif defined(__F16C__)
-  return _cvtsh_ss(h.x);
-
-#elif defined(PADDLE_ARM_64)  // test on RPI
+#elif defined(PADDLE_NEON_64)
   float res;
   asm volatile(
       "ld1 {v0.h}[0], [%[half_ptr]] \n"
@@ -384,6 +672,23 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) {
       "memory", "v0");
   return res;

+#elif defined(PADDLE_NEON_32)
+  float res;
+  asm volatile(
+      "vld1.16 {d0[0]}, [%[half_ptr]] \n"
+      "vcvt.f32.f16 q0, d0 \n"
+      "vst1.32 {d0[0]}, [%[float_ptr]] \n"
+      :  // outputs
+      :  // inputs
+      [half_ptr] "r"(&(h.x)),
+      [float_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0");
+  return res;
+
+#elif defined(__F16C__)
+  return _cvtsh_ss(h.x);
+
 #else
 // Conversion routine adapted from
 // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
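The inverse direction is simpler because every half value is exactly representable as a float; only subnormal halves need renormalizing. A self-contained sketch of that software path (editor's illustration, not the commit's code):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float half_to_float_sketch(uint16_t h) {
  uint32_t sign = (uint32_t(h) & 0x8000u) << 16;
  uint32_t exp = (h >> 10) & 0x1fu;
  uint32_t mant = h & 0x3ffu;
  uint32_t bits;
  if (exp == 0 && mant == 0) {
    bits = sign;                               // signed zero
  } else if (exp == 31) {
    bits = sign | 0x7f800000u | (mant << 13);  // infinity or NaN
  } else if (exp == 0) {
    // subnormal half: shift mantissa up until the implicit bit appears
    int e = -1;
    do { e++; mant <<= 1; } while ((mant & 0x400u) == 0);
    bits = sign | ((112u - e) << 23) | ((mant & 0x3ffu) << 13);
  } else {
    bits = sign | ((exp + 112u) << 23) | (mant << 13);  // rebias 15 -> 127
  }
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

int main() {
  std::printf("0x3c00 -> %f (expect 1.0)\n", half_to_float_sketch(0x3c00));
  return 0;
}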
@@ -406,12 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) {
 #endif
 }

-PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) {
-  float16 res;
-  res.x = x;
-  return res;
-}
-
 }  // namespace fp16_impl
 }  // namespace paddle
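Removing fp16_impl::uint16_to_half closes off a likely confusion: it copied raw bits, while the explicit float16(uint16_t) constructor that remains converts numerically through float_to_half_rn. The two were never interchangeable; raw-bit construction now goes through the public x member. A sketch of the distinction (illustrative, not from the commit):

#include "paddle/math/float16.h"

void raw_vs_numeric() {
  paddle::float16 raw;
  raw.x = 0x3c00;  // raw bits: the half-precision encoding of 1.0

  paddle::float16 numeric(uint16_t(0x3c00));  // numeric: encodes 15360.0f
  (void)raw; (void)numeric;
}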