From 41bd1f9115c4cb8a9a9afcc656b6d0f00d9b1cb5 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Tue, 28 Nov 2017 17:09:12 -0800
Subject: [PATCH] fix gpu test, clean code and add cmake

---
 CMakeLists.txt                     |   1 +
 cmake/configure.cmake              |   5 +
 paddle/math/float16.h              | 217 ++++++++---------------------
 paddle/math/tests/test_float16.cpp |   8 --
 paddle/math/tests/test_float16.cu  |  90 ++++++------
 5 files changed, 109 insertions(+), 212 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fd3582a1bca..a2bb5d73bcf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 24ddb24399d..2c202707ff6 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
index 65c0489e1fe..778b48bce8d 100644
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include 
+#include 
 
 #ifdef PADDLE_WITH_CUDA
 #include 
@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
 public:
   uint16_t x;
 
+  // Constructors
   HOSTDEVICE inline float16() : x(0) {}
 
   HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -89,8 +90,7 @@ public:
 
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h,
-  // which is included in arm_neon.h.
+  // float16_t is an alias for __fp16
   HOSTDEVICE inline explicit float16(const float16_t& h) {
     x = *reinterpret_cast(&h);
   }
@@ -141,6 +141,7 @@ public:
     return *this;
   }
 
+// Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
@@ -219,6 +220,7 @@ public:
     return *this;
  }
 
+// Conversion operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
@@ -353,27 +355,54 @@ private:
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
 DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
 }
 
-DEVICE inline half operator-(const half& a) { return __hneg(a); }
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
 
 DEVICE inline half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
 }
 
 DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
 }
 
-/*
-DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
-}
-
-DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
-}
-
-DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
-}
-
-DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-}
-
-DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
-}
-
-DEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-
-DEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-
-DEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-
-DEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-
-DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
-}
-
-DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
-}
-
-DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
-}
-
-DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
-}
-
-DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
-}
-
-DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
-}
-*/
+#endif  // PADDLE_CUDA_FP16
 
 // Arithmetic operators on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
+#if defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-/*
-HOST inline float16 operator+(const float16& a, const float16& b) {
-  return float16(vaddh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator-(const float16& a, const float16& b) {
-  return float16(vsubh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator*(const float16& a, const float16& b) {
-  return float16(vmulh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator/(const float16& a, const float16& b) {
-  return float16(vdivh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline float16 operator-(const float16& a) {
-  return float16(vnegh_f16(float16_t(a)));
-}
-
-HOST inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-
-HOST inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-
-HOST inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-
-HOST inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-
-HOST inline bool operator==(const float16& a, const float16& b) {
-  return static_cast(vceqh_f16(float16_t(a), float16_t(b)));
-}
-
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
-
-HOST inline bool operator<(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast(vclth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) < float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator<=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast(vcleh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) <= float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator>(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast(vcgth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) > float(b);
-#endif  // PADDLE_NEON_64
-}
-
-HOST inline bool operator>=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast(vcgeh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) >= float(b);
-#endif  // PADDLE_NEON_64
-}
-*/
-
 // Arithmetic operators, software emulated on other CPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
index f5541d8f0fc..74cc55aa379 100644
--- a/paddle/math/tests/test_float16.cpp
+++ b/paddle/math/tests/test_float16.cpp
@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
 
-  // Implicit conversion to and from Eigen::half
-  /*
-  Eigen::half tmp = float16(1.0f);
-  float16 v_conv = tmp;
-  EXPECT_EQ(tmp.x, 0x3c00);
-  EXPECT_EQ(v_conv.x, 0x3c00);
-  */
-
   // Default constructor
   float16 v_def;
   EXPECT_EQ(v_def.x, 0x0000);
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu
index 941f2666035..4b520feaaf5 100644
--- a/paddle/math/tests/test_float16.cu
+++ b/paddle/math/tests/test_float16.cu
@@ -15,41 +15,38 @@ limitations under the License. */
 
 #include "paddle/utils/Logging.h"
 
-#define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type( \
-      const float16* in1, const float16* in2, float16* out) { \
-    out[0] = in1[0] sign in2[0]; \
+#define ARITHMETIC_KERNEL(op_type, sign) \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+    out[0] = in1[0] sign in2[0]; \
   }
 
-#define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(float16* in1, const float16* in2) { \
-    in1[0] sign in2[0]; \
-  }
+#define COMPOUND_KERNEL(op_type, sign) \
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
 
-#define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \
-    out[0] = in1[0] sign in2[0]; \
+#define COMPARISON_KERNEL(op_type, sign) \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+    out[0] = in1[0] sign in2[0]; \
   }
 
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2, *out; \
-    float16 *d_in1, *d_in2, *d_out; \
-    int size = sizeof(float16); \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    out = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    out = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
     cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(out[0]), v_out); \
+    EXPECT_EQ(float(float16(out[0])), v_out); \
     free(in1); \
    free(in2); \
     free(out); \
@@ -61,20 +58,20 @@ limitations under the License. */
 #define COMPOUND_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
-    int size = sizeof(float16); \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2); \
     cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(in1[0]), v_out); \
+    EXPECT_EQ(float(float16(in1[0])), v_out); \
     free(in1); \
     free(in2); \
     cudaFree(d_in1); \
@@ -84,18 +81,18 @@ limitations under the License. */
 #define COMPARISON_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, bool v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
     bool *out, *d_out; \
-    int size = sizeof(float16); \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, 1); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
     out = (bool*)malloc(1); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
@@ -112,6 +109,7 @@ limitations under the License. */
 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
 
+#if CUDA_VERSION < 9000
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)
 
 // Negative sign kernel
-__global__ void Neg(float16* in) { in[0] = -in[0]; }
+__global__ void Neg(half* in) { in[0] = -in[0]; }
 
 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
-  float16 *in, *d_in;
-  int size = sizeof(float16);
+  half *in, *d_in;
+  int size = sizeof(half);
   cudaMalloc((void**)&d_in, size);
-  in = (float16*)malloc(size);
-  in[0] = float16(v_in);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
   Neg<<<1, 1>>>(d_in);
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(in[0]), v_out);
+  EXPECT_EQ(float(float16(in[0])), v_out);
   free(in);
   cudaFree(d_in);
 }
@@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) {
   TestGreaterEqual(4, 4, true);
   TestGreaterEqual(4, 5, false);
 }
+#endif  // CUDA_VERSION
 
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
@@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) {
   EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
   EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
 
-  // Implicit conversion to and from cuda half
-  half tmp = float16(1.0f);
-  float16 val = tmp;
-  EXPECT_EQ(val.x, 0x3c00);
-
   // Assignment operator
   float16 v_assign;
-  v_assign = tmp;
+  v_assign = half(float16(1.0f));
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
 }  // namespace paddle
-#endif
+#endif  // PADDLE_CUDA_FP16
-- 
GitLab
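
For reference, the guard pattern that the float16.h hunks above apply to every half operator can be exercised in isolation. The sketch below is not PaddlePaddle code: the file, kernel, and helper names are invented for illustration, and it assumes a CUDA 9.0+ toolkit where __float2half and __half2float are callable from host code. It compiles the __hadd intrinsic only when the device code targets sm_53 or newer and otherwise falls back to float arithmetic, which is the same compile-time dispatch the patch adds for CUDA versions older than 9.0.

// fp16_add_demo.cu - standalone sketch, not part of this patch.
// Build with plain nvcc; the fallback branch is used unless compiled
// for a native-fp16 architecture (e.g. nvcc -arch=sm_70 fp16_add_demo.cu).
#include <cuda_fp16.h>
#include <cstdio>

__device__ half half_add(half a, half b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  // Native half-precision add, only available on sm_53 and newer.
  return __hadd(a, b);
#else
  // Older architectures: promote to float, add, convert back to half.
  return __float2half(__half2float(a) + __half2float(b));
#endif
}

__global__ void AddKernel(const half* x, const half* y, half* out) {
  out[0] = half_add(x[0], y[0]);
}

int main() {
  half h_x = __float2half(1.5f), h_y = __float2half(2.25f), h_out;
  half *d_x, *d_y, *d_out;
  cudaMalloc((void**)&d_x, sizeof(half));
  cudaMalloc((void**)&d_y, sizeof(half));
  cudaMalloc((void**)&d_out, sizeof(half));
  cudaMemcpy(d_x, &h_x, sizeof(half), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, &h_y, sizeof(half), cudaMemcpyHostToDevice);
  AddKernel<<<1, 1>>>(d_x, d_y, d_out);
  cudaMemcpy(&h_out, d_out, sizeof(half), cudaMemcpyDeviceToHost);
  printf("1.5 + 2.25 = %f\n", __half2float(h_out));  // expect 3.750000
  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_out);
  return 0;
}

The operators added in the patch take the same fallback route but go through paddle::float16 (half to float16 to float and back) rather than the raw conversion intrinsics used here. On the CPU side, the new WITH_ARM_FP16 option stays OFF by default; enabling it with cmake -DWITH_ARM_FP16=ON defines PADDLE_ARM_FP16 and adds -march=armv8.2-a+fp16+simd to the build.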