From 783759907592180e6fddff295add50dec3d9403a Mon Sep 17 00:00:00 2001
From: sneaxiy <32832641+sneaxiy@users.noreply.github.com>
Date: Mon, 27 Dec 2021 14:19:10 +0800
Subject: [PATCH] refine float16 implementation (#38439)

---
 paddle/fluid/operators/clip_op.h              |  2 +-
 .../device/gpu/cuda/cuda_device_function.h    |  4 +-
 paddle/fluid/platform/float16.h               | 66 +++++++++----------
 paddle/fluid/platform/float16_test.cu         | 42 ++++++------
 4 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index abf721936b4..116edc0390e 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -95,7 +95,7 @@ class ClipKernel : public framework::OpKernel<T> {
                       platform::errors::InvalidArgument(
                           "max should be greater than or equal to min. "
                           "But received min = %f, max = %f",
-                          min, max));
+                          static_cast<float>(min), static_cast<float>(max)));
 
     auto* x_var = context.InputVar("X");
    if (x_var->IsType<framework::LoDTensor>()) {
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
index e7d80757395..7fe2367b551 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h
@@ -73,7 +73,7 @@ template <>
 __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
                                                        float16 val, int delta,
                                                        int width) {
-  return float16(__shfl_down_sync(mask, static_cast<half>(val),
+  return float16(__shfl_down_sync(mask, val.to_half(),
                                   static_cast<unsigned>(delta), width));
 }
 
@@ -103,7 +103,7 @@ CudaShuffleDownSync(unsigned mask, paddle::platform::complex<double> val,
 template <>
 __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask,
                                                       float16 val, int width) {
-  return float16(__shfl_xor_sync(mask, static_cast<half>(val), width));
+  return float16(__shfl_xor_sync(mask, val.to_half(), width));
 }
 
 template <>
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index bdd4d54b3d1..b6d088421af 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -214,7 +214,7 @@ struct PADDLE_ALIGN(2) float16 {
 
   // Conversion opertors
 #ifdef PADDLE_CUDA_FP16
-  HOSTDEVICE inline explicit operator half() const {
+  HOSTDEVICE inline half to_half() const {
 #if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000
     __half_raw h;
     h.x = x;
@@ -233,7 +233,7 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif
 
-  HOSTDEVICE inline explicit operator float() const {
+  HOSTDEVICE inline operator float() const {
 #if defined(PADDLE_CUDA_FP16) && \
     (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300))
     half tmp = *reinterpret_cast<const half*>(this);
@@ -302,7 +302,7 @@ struct PADDLE_ALIGN(2) float16 {
     return static_cast<uint64_t>(static_cast<float>(*this));
   }
 
-  HOSTDEVICE inline explicit operator double() const {
+  HOSTDEVICE inline operator double() const {
     return static_cast<double>(static_cast<float>(*this));
   }
 
@@ -350,7 +350,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
   return __hadd(a, b);
 #else
   float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -359,7 +359,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
   return __hsub(a, b);
 #else
   float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -368,7 +368,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
   return __hmul(a, b);
 #else
   float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -379,7 +379,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
   return __float2half(num / denom);
 #else
   float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -388,7 +388,7 @@ DEVICE inline half operator-(const half& a) {
   return __hneg(a);
 #else
   float res = -static_cast<float>(float16(a));
-  return half(float16(res));
+  return float16(res).to_half();
 #endif
 }
 
@@ -470,7 +470,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 // in __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
+  return float16(__hadd(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator+(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) + static_cast<float>(b));
@@ -478,7 +478,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hadd(half(a), half(b)));
+  return float16(__hadd(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) + static_cast<float>(b));
 #endif
@@ -487,7 +487,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
+  return float16(__hsub(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator-(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) - static_cast<float>(b));
@@ -495,7 +495,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hsub(half(a), half(b)));
+  return float16(__hsub(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) - static_cast<float>(b));
 #endif
@@ -504,7 +504,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
+  return float16(__hmul(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator*(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) * static_cast<float>(b));
@@ -512,7 +512,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
 #else
 HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hmul(half(a), half(b)));
+  return float16(__hmul(a.to_half(), b.to_half()));
 #else
   return float16(static_cast<float>(a) * static_cast<float>(b));
 #endif
@@ -521,7 +521,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  return float16(__hdiv(half(a), half(b)));
+  return float16(__hdiv(a.to_half(), b.to_half()));
 }
 HOST inline float16 operator/(const float16& a, const float16& b) {
   return float16(static_cast<float>(a) / static_cast<float>(b));
@@ -530,8 +530,8 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
 HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
 // TODO(kexinzhao): check which cuda version starts to support __hdiv
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
+  float num = __half2float(a.to_half());
+  float denom = __half2float(b.to_half());
   return float16(num / denom);
 #else
   return float16(static_cast<float>(a) / static_cast<float>(b));
@@ -541,7 +541,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
+  return float16(__hneg(a.to_half()));
 }
 HOST inline float16 operator-(const float16& a) {
   float16 res;
@@ -551,7 +551,7 @@ HOST inline float16 operator-(const float16& a) {
 #else
 HOSTDEVICE inline float16 operator-(const float16& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return float16(__hneg(half(a)));
+  return float16(__hneg(a.to_half()));
 #else
   float16 res;
   res.x = a.x ^ 0x8000;
@@ -584,7 +584,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
 // in __host__ __device__ function
 #if defined(__HIPCC__)
 DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
+  return __heq(a.to_half(), b.to_half());
 }
 HOST inline bool operator==(const float16& a, const float16& b) {
   return static_cast<float>(a) == static_cast<float>(b);
@@ -592,7 +592,7 @@ HOST inline bool operator==(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __heq(half(a), half(b));
+  return __heq(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) == static_cast<float>(b);
 #endif
@@ -601,7 +601,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
+  return __hne(a.to_half(), b.to_half());
 }
 HOST inline bool operator!=(const float16& a, const float16& b) {
   return static_cast<float>(a) != static_cast<float>(b);
@@ -609,7 +609,7 @@ HOST inline bool operator!=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hne(half(a), half(b));
+  return __hne(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) != static_cast<float>(b);
 #endif
@@ -618,7 +618,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
+  return __hlt(a.to_half(), b.to_half());
 }
 HOST inline bool operator<(const float16& a, const float16& b) {
   return static_cast<float>(a) < static_cast<float>(b);
@@ -626,7 +626,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hlt(half(a), half(b));
+  return __hlt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) < static_cast<float>(b);
 #endif
@@ -635,7 +635,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
+  return __hle(a.to_half(), b.to_half());
 }
 HOST inline bool operator<=(const float16& a, const float16& b) {
   return static_cast<float>(a) <= static_cast<float>(b);
@@ -643,7 +643,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hle(half(a), half(b));
+  return __hle(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) <= static_cast<float>(b);
 #endif
@@ -652,7 +652,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
+  return __hgt(a.to_half(), b.to_half());
 }
 HOST inline bool operator>(const float16& a, const float16& b) {
   return static_cast<float>(a) > static_cast<float>(b);
@@ -660,7 +660,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hgt(half(a), half(b));
+  return __hgt(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) > static_cast<float>(b);
 #endif
@@ -669,7 +669,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
 
 #if defined(__HIPCC__)
 DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
+  return __hge(a.to_half(), b.to_half());
 }
 HOST inline bool operator>=(const float16& a, const float16& b) {
   return static_cast<float>(a) >= static_cast<float>(b);
@@ -677,7 +677,7 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
 #else  // __HIPCC__
 HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hge(half(a), half(b));
+  return __hge(a.to_half(), b.to_half());
 #else
   return static_cast<float>(a) >= static_cast<float>(b);
 #endif
@@ -945,12 +945,12 @@ HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
 // HIPCC has compile error if call __device__ function __hisnan in __host__
 // __device__ function
 #if defined(PADDLE_CUDA_FP16) && defined(__HIPCC__)
-DEVICE inline bool(isnan)(const float16& a) { return __hisnan(half(a)); }
+DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); }
 HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; }
 #else
 HOSTDEVICE inline bool(isnan)(const float16& a) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
-  return __hisnan(half(a));
+  return __hisnan(a.to_half());
 #else
   return (a.x & 0x7fff) > 0x7c00;
 #endif
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 75e35d398c2..8be774441fe 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -48,8 +48,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
@@ -73,8 +73,8 @@ limitations under the License. */
   hipMalloc(reinterpret_cast<void**>(&d_in2), size);                      \
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2);      \
@@ -99,8 +99,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<bool*>(malloc(1));                               \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);                     \
   hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);                     \
   hipLaunchKernelGGL(op_type, dim3(1), dim3(1), 0, 0, d_in1, d_in2, d_out); \
@@ -126,8 +126,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2, d_out);                                 \
@@ -151,8 +151,8 @@ limitations under the License. */
   cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                     \
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2);                                        \
@@ -177,8 +177,8 @@ limitations under the License. */
   in1 = reinterpret_cast<half*>(malloc(size));                            \
   in2 = reinterpret_cast<half*>(malloc(size));                            \
   out = reinterpret_cast<bool*>(malloc(1));                               \
-  in1[0] = half(float16(v_in1));                                          \
-  in2[0] = half(float16(v_in2));                                          \
+  in1[0] = float16(v_in1).to_half();                                      \
+  in2[0] = float16(v_in2).to_half();                                      \
   cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                   \
   cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                   \
   op_type<<<1, 1>>>(d_in1, d_in2, d_out);                                 \
@@ -221,7 +221,7 @@ void TestNeg(float v_in, float v_out) {
   cudaMalloc(reinterpret_cast<void**>(&d_in), size);
 #endif
   in = reinterpret_cast<half*>(malloc(size));
-  in[0] = half(float16(v_in));
+  in[0] = float16(v_in).to_half();
 #ifdef PADDLE_WITH_HIP
   hipMemcpy(d_in, in, size, hipMemcpyHostToDevice);
 #else
@@ -299,17 +299,17 @@ TEST(float16, comparision_on_gpu) {
 
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
-  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
-  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
-  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
-  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
-  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
-  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
-  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+  EXPECT_EQ(float16(float16(1.0f).to_half()).x, 0x3c00);
+  EXPECT_EQ(float16(float16(0.5f).to_half()).x, 0x3800);
+  EXPECT_EQ(float16(float16(0.33333f).to_half()).x, 0x3555);
+  EXPECT_EQ(float16(float16(0.0f).to_half()).x, 0x0000);
+  EXPECT_EQ(float16(float16(-0.0f).to_half()).x, 0x8000);
+  EXPECT_EQ(float16(float16(65504.0f).to_half()).x, 0x7bff);
+  EXPECT_EQ(float16(float16(65536.0f).to_half()).x, 0x7c00);
 
   // Assignment operator
   float16 v_assign;
-  v_assign = half(float16(1.0f));
+  v_assign = float16(1.0f).to_half();
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
-- 
GitLab
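Aside on the conversion API: the patch swaps the `explicit operator half()` conversion for a named `to_half()` method, and drops `explicit` from the float/double conversion operators. A named method never enters overload resolution, so float16 values can now promote to float silently (which the clip_op.h hunk and the test macros rely on) without a competing float16 -> half conversion path. Below is a minimal standalone sketch of that pattern; `fp16` and `half_stub` are stand-ins for illustration, not paddle's real float16/half types, and the float decoding is stubbed out.

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the CUDA half type; only carries the raw bits.
    struct half_stub {
      uint16_t x;
    };

    // Stand-in for paddle::platform::float16 after the patch.
    struct fp16 {
      uint16_t x = 0;

      // Named conversion: callers must write v.to_half() explicitly,
      // so this never competes in overload resolution.
      half_stub to_half() const { return half_stub{x}; }

      // Implicit conversion: fp16 promotes to float in mixed arithmetic.
      operator float() const {
        // A real float16 decodes IEEE binary16 here; this stub knows 1.0 only.
        return x == 0x3c00 ? 1.0f : 0.0f;
      }
    };

    int main() {
      fp16 v;
      v.x = 0x3c00;               // binary16 bit pattern of 1.0
      float sum = v + 2.5f;       // implicit fp16 -> float promotion
      half_stub h = v.to_half();  // explicit, named conversion to the GPU type
      std::printf("sum = %.1f, h.x = 0x%04x\n", sum, static_cast<unsigned>(h.x));
      return 0;
    }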
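Aside on the `__CUDA_ARCH__ >= 530` guards: every arithmetic hunk keeps the same dispatch, native `__h*` intrinsics when compiling device code for sm_53 or newer, float math otherwise. A self-contained CUDA sketch of that guard, using the raw `__half` type from <cuda_fp16.h> instead of paddle's float16 (the helper name `add_f16` is made up for the example):

    #include <cuda_fp16.h>
    #include <cstdio>

    // Same dispatch the patch uses: half intrinsics only on sm_53+ device code,
    // otherwise promote to float, compute, and convert back.
    __host__ __device__ inline __half add_f16(__half a, __half b) {
    #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
      return __hadd(a, b);  // native half-precision add
    #else
      return __float2half(__half2float(a) + __half2float(b));  // float fallback
    #endif
    }

    __global__ void add_kernel(__half* out, __half a, __half b) {
      *out = add_f16(a, b);
    }

    int main() {
      __half* d_out;
      __half h_out;
      cudaMalloc(reinterpret_cast<void**>(&d_out), sizeof(__half));
      add_kernel<<<1, 1>>>(d_out, __float2half(1.5f), __float2half(2.25f));
      cudaMemcpy(&h_out, d_out, sizeof(__half), cudaMemcpyDeviceToHost);
      std::printf("%f\n", __half2float(h_out));  // prints 3.750000
      cudaFree(d_out);
      return 0;
    }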
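Aside on the clip_op.h hunk: with T = float16, handing bare `min, max` to a `%f` conversion passes a class type where a floating-point value is expected, presumably something paddle's error formatter cannot render directly; casting to float first is the portable fix. The sharpest form of the underlying issue shows up with plain C printf, where a class type through varargs is undefined behavior (`fp16_like` below is a hypothetical stand-in, not paddle's type):

    #include <cstdio>

    // Hypothetical float16-style wrapper for illustration only.
    struct fp16_like {
      unsigned short x = 0x3800;  // binary16 bit pattern of 0.5
      operator float() const { return 0.5f; }
    };

    int main() {
      fp16_like v;
      // std::printf("%f\n", v);  // UB: class type passed through C varargs
      std::printf("%f\n", static_cast<float>(v));  // OK: %f gets a real float
      return 0;
    }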