From 783759907592180e6fddff295add50dec3d9403a Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Mon, 27 Dec 2021 14:19:10 +0800
Subject: [PATCH] refine float16 implementation (#38439)

---
 paddle/fluid/operators/clip_op.h              |  2 +-
 .../device/gpu/cuda/cuda_device_function.h    |  4 +-
 paddle/fluid/platform/float16.h               | 66 +++++++++----------
 paddle/fluid/platform/float16_test.cu         | 42 ++++++------
 4 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index abf721936b4..116edc0390e 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -95,7 +95,7 @@ class ClipKernel : public framework::OpKernel<T> {
                       platform::errors::InvalidArgument(
                           "max should be greater than or equal to min. "
                           "But received min = %f, max = %f",
-                          min, max));
+                          static_cast<float>(min), static_cast<float>(max)));
 
     auto* x_var = context.InputVar("X");
    if (x_var->IsType<framework::LoDTensor>()) {
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
index e7d80757395..7fe2367b551 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
@@ -73,7 +73,7 @@ template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
-  return float16(__shfl_down_sync(mask, static_cast<half>(val),
+  return float16(__shfl_down_sync(mask, val.to_half(),
                                   static_cast<unsigned>(delta), width));
 }
 
@@ -103,7 +103,7 @@ CudaShuffleDownSync(unsigned mask, paddle::platform::complex<double> val,
 template <>
 __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
                                                       float16 val, int width) {
-  return float16(__shfl_xor_sync(mask, static_cast<half>(val), width));
+  return float16(__shfl_xor_sync(mask, val.to_half(), width));
 }
 
 template <>
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index bdd4d54b3d1..b6d088421af 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -214,7 +214,7 @@ struct PADDLE_ALIGN(2) float16 {
 
   // Conversion opertors
 #ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit operator half() const {
+  HOSTDEVICE inline half to_half() const {
 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
     __half_raw h;
     h.x = x;
@@ -233,7 +233,7 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif
 
-  HOSTDEVICE inline explicit operator float() const {
+  HOSTDEVICE inline operator float() const {
 #if defined(PADDLE_CUDA_FP16) && \
     (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
     half tmp = *reinterpret_cast<const half*>(this);
@@ -302,7 +302,7 @@ struct PADDLE_ALIGN(2) float16 {
     return static_cast<uint64_t>(static_cast<float>(*this));
   }
 
-  HOSTDEVICE inline explicit operator double() const {
+  HOSTDEVICE inline operator double() const {
     return static_cast<double>(static_cast<float>(*this));
   }
 
@@ -350,7 +350,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
   return __hadd(a, b);
 #else
   float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -359,7 +359,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
   return __hsub(a, b);
 #else
   float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -368,7 +368,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
   return __hmul(a, b);
 #else
   float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -379,7 +379,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
   return __float2half(num / denom);
 #else
   float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -388,7 +388,7 @@ DEVICE inline half operator-(const half& a) {
   return __hneg(a);
 #else
   float res = -static_cast<float>(float16(a));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -470,7 +470,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 // in __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
+  return float16(__hadd(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator+(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) + static_cast<float>(b));
@@ -478,7 +478,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hadd(half(a), half(b)));
+  return float16(__hadd(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) + static_cast<float>(b));
 #endif
@@ -487,7 +487,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
+  return float16(__hsub(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator-(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) - static_cast<float>(b));
@@ -495,7 +495,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hsub(half(a), half(b)));
+  return float16(__hsub(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) - static_cast<float>(b));
 #endif
@@ -504,7 +504,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
+  return float16(__hmul(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator*(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) * static_cast<float>(b));
@@ -512,7 +512,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hmul(half(a), half(b)));
+  return float16(__hmul(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) * static_cast<float>(b));
 #endif
@@ -521,7 +521,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  return float16(__hdiv(half(a), half(b)));
+  return float16(__hdiv(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator/(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) / static_cast<float>(b));
@@ -530,8 +530,8 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
 HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 // TODO(kexinzhao): check which cuda version starts to support __hdiv
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
+  float num = __half2float(a.to_half());
+  float denom = __half2float(b.to_half());
   return float16(num / denom);
 #else
   return float16(static_cast<float>(a) / static_cast<float>(b));
@@ -541,7 +541,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
+  return float16(__hneg(a.to_half()));
 }
 HOST inline float16 operator-(const float16& a) {
   float16 res;
@@ -551,7 +551,7 @@ HOST inline float16 operator-(const float16& a) {
 #else
 HOSTDEVICE inline float16 operator-(const float16& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hneg(half(a)));
+  return float16(__hneg(a.to_half()));
 #else
   float16 res;
   res.x = a.x ^ 0x8000;
@@ -584,7 +584,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
 // in __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
+  return __heq(a.to_half(), b.to_half());
 }
 HOST inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
@@ -592,7 +592,7 @@ HOST inline bool operator==(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __heq(half(a), half(b));
+  return __heq(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) == static_cast<float>(b);
 #endif
@@ -601,7 +601,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
+  return __hne(a.to_half(), b.to_half());
 }
 HOST inline bool operator!=(const float16& a, const float16& b) {
   return static_cast<float>(a) != static_cast<float>(b);
@@ -609,7 +609,7 @@ HOST inline bool operator!=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hne(half(a), half(b));
+  return __hne(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) != static_cast<float>(b);
 #endif
@@ -618,7 +618,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
+  return __hlt(a.to_half(), b.to_half());
 }
 HOST inline bool operator<(const float16& a, const float16& b) {
   return static_cast<float>(a) < static_cast<float>(b);
@@ -626,7 +626,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hlt(half(a), half(b));
+  return __hlt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) < static_cast<float>(b);
 #endif
@@ -635,7 +635,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
+  return __hle(a.to_half(), b.to_half());
 }
 HOST inline bool operator<=(const float16& a, const float16& b) {
   return static_cast<float>(a) <= static_cast<float>(b);
@@ -643,7 +643,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hle(half(a), half(b));
+  return __hle(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) <= static_cast<float>(b);
 #endif
@@ -652,7 +652,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
+  return __hgt(a.to_half(), b.to_half());
 }
 HOST inline bool operator>(const float16& a, const float16& b) {
   return static_cast<float>(a) > static_cast<float>(b);
@@ -660,7 +660,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hgt(half(a), half(b));
+  return __hgt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) > static_cast<float>(b);
 #endif
@@ -669,7 +669,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
+  return __hge(a.to_half(), b.to_half());
 }
 HOST inline bool operator>=(const float16& a, const float16& b) {
   return static_cast<float>(a) >= static_cast<float>(b);
@@ -677,7 +677,7 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hge(half(a), half(b));
+  return __hge(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) >= static_cast<float>(b);
 #endif
@@ -945,12 +945,12 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
 // HIPCC has compile error if call __device__ function __hisnan in __host__
 // __device__ function
 #if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__)
-DEVICE inline bool(isnan)(const float16& a) { return __hisnan(half(a)); }
+DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); }
 HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; }
 #else
 HOSTDEVICE inline bool(isnan)(const float16& a) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hisnan(half(a));
+  return __hisnan(a.to_half());
 #else
   return (a.x & 0x7fff) > 0x7c00;
 #endif
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 75e35d398c2..8be774441fe 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -48,8 +48,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
@@ -73,8 +73,8 @@ limitations under the License. */
   hipMalloc(reinterpret_cast<void**>(&d_in2), size);                      \
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2);      \
@@ -99,8 +99,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<bool*>(malloc(1));                               \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
@@ -126,8 +126,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2, d_out);                                 \
@@ -151,8 +151,8 @@ limitations under the License. */
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                     \
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2);                                        \
@@ -177,8 +177,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<bool*>(malloc(1));                               \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2, d_out);                                 \
@@ -221,7 +221,7 @@ void TestNeg(float v_in, float v_out) {
   cudaMalloc(reinterpret_cast<void**>(&d_in), size);
 #endif
   in = reinterpret_cast<half*>(malloc(size));
-  in[0] = half(float16(v_in));
+  in[0] = float16(v_in).to_half();
 #ifdef PADDLE_WITH_HIP
   hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
 #else
@@ -299,17 +299,17 @@ TEST(float16, comparision_on_gpu) {
 
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
-  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
-  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
-  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
-  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
-  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
-  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
-  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+  EXPECT_EQ(float16(float16(1.0f).to_half()).x, 0x3c00);
+  EXPECT_EQ(float16(float16(0.5f).to_half()).x, 0x3800);
+  EXPECT_EQ(float16(float16(0.33333f).to_half()).x, 0x3555);
+  EXPECT_EQ(float16(float16(0.0f).to_half()).x, 0x0000);
+  EXPECT_EQ(float16(float16(-0.0f).to_half()).x, 0x8000);
+  EXPECT_EQ(float16(float16(65504.0f).to_half()).x, 0x7bff);
+  EXPECT_EQ(float16(float16(65536.0f).to_half()).x, 0x7c00);
 
   // Assignment operator
   float16 v_assign;
-  v_assign = half(float16(1.0f));
+  v_assign = float16(1.0f).to_half();
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
-- 
GitLab
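Aside on the conversion API: the patch swaps the `explicit operator half()` conversion for a named `to_half()` method, and drops `explicit` from the float/double conversion operators. A named method never enters overload resolution, so float16 values can now promote to float silently (which the clip_op.h hunk and the test macros rely on) without a competing float16 -> half conversion path. Below is a minimal standalone sketch of that pattern; `fp16` and `half_stub` are stand-ins for illustration, not paddle's real float16/half types, and the float decoding is stubbed out.

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the CUDA half type; only carries the raw bits.
    struct half_stub {
      uint16_t x;
    };

    // Stand-in for paddle::platform::float16 after the patch.
    struct fp16 {
      uint16_t x = 0;

      // Named conversion: callers must write v.to_half() explicitly,
      // so this never competes in overload resolution.
      half_stub to_half() const { return half_stub{x}; }

      // Implicit conversion: fp16 promotes to float in mixed arithmetic.
      operator float() const {
        // A real float16 decodes IEEE binary16 here; this stub knows 1.0 only.
        return x == 0x3c00 ? 1.0f : 0.0f;
      }
    };

    int main() {
      fp16 v;
      v.x = 0x3c00;               // binary16 bit pattern of 1.0
      float sum = v + 2.5f;       // implicit fp16 -> float promotion
      half_stub h = v.to_half();  // explicit, named conversion to the GPU type
      std::printf("sum = %.1f, h.x = 0x%04x\n", sum, static_cast<unsigned>(h.x));
      return 0;
    }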
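Aside on the `__CUDA_ARCH__ >= 530` guards: every arithmetic hunk keeps the same dispatch, native `__h*` intrinsics when compiling device code for sm_53 or newer, float math otherwise. A self-contained CUDA sketch of that guard, using the raw `__half` type from <cuda_fp16.h> instead of paddle's float16 (the helper name `add_f16` is made up for the example):

    #include <cuda_fp16.h>
    #include <cstdio>

    // Same dispatch the patch uses: half intrinsics only on sm_53+ device code,
    // otherwise promote to float, compute, and convert back.
    __host__ __device__ inline __half add_f16(__half a, __half b) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
      return __hadd(a, b);  // native half-precision add
    #else
      return __float2half(__half2float(a) + __half2float(b));  // float fallback
    #endif
    }

    __global__ void add_kernel(__half* out, __half a, __half b) {
      *out = add_f16(a, b);
    }

    int main() {
      __half* d_out;
      __half h_out;
      cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(__half));
      add_kernel<<<1, 1>>>(d_out, __float2half(1.5f), __float2half(2.25f));
      cudaMemcpy(&h_out, d_out, sizeof(__half), cudaMemcpyDeviceToHost);
      std::printf("%f\n", __half2float(h_out));  // prints 3.750000
      cudaFree(d_out);
      return 0;
    }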
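Aside on the clip_op.h hunk: with T = float16, handing bare `min, max` to a `%f` conversion passes a class type where a floating-point value is expected, presumably something paddle's error formatter cannot render directly; casting to float first is the portable fix. The sharpest form of the underlying issue shows up with plain C printf, where a class type through varargs is undefined behavior (`fp16_like` below is a hypothetical stand-in, not paddle's type):

    #include <cstdio>

    // Hypothetical float16-style wrapper for illustration only.
    struct fp16_like {
      unsigned short x = 0x3800;  // binary16 bit pattern of 0.5
      operator float() const { return 0.5f; }
    };

    int main() {
      fp16_like v;
      // std::printf("%f\n", v);  // UB: class type passed through C varargs
      std::printf("%f\n", static_cast<float>(v));  // OK: %f gets a real float
      return 0;
    }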