diff --git a/paddle/math/float16.h b/paddle/math/float16.h
index ae7d9754aa336d1ff3c13d8c56207f58b4f17e5f..e9d4e6737dc145c3d2ab803bfe5bd84f6a7744b9 100644
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -20,7 +20,7 @@ limitations under the License. */

 #include
 #include
-#include
+#define USE_EIGEN

 #ifdef USE_EIGEN  // delete this #if macro
 #include "Eigen/src/Core/arch/CUDA/Half.h"
@@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
 struct PADDLE_ALIGN(2) float16 {
   uint16_t x;

-  // explicit for different types, implicit for half and Eigen::half
-
   PADDLE_HOSTDEVICE inline float16() {}

   PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 {
   PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
 #endif  // USE_EIGEN

-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
   // __fp16 is a native half precision data type for arm cpu,
   // float16_t is an alias for __fp16 in arm_fp16.h,
   // which is included in arm_neon.h.
@@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // USE_EIGEN
-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
   PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) {
     x = *reinterpret_cast<const uint16_t*>(rhs);
     return *this;
   }
@@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 {
   }
 #endif  // USE_EIGEN

-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
   // check whether it works or not
   PADDLE_HOSTDEVICE inline operator float16_t() const {
     float16 h = *this;
@@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) {

 __device__ inline float16 operator/(const float16& a, const float16& b) {
   // TODO(kexinzhao): check the cuda version that starts to support __hdiv
-  // instinsic
   float num = __half2float(half(a));
   float denom = __half2float(half(b));
   return float16(num / denom);
@@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1;
 PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   half tmp = __float2half(f);
-  return *reinterpret_cast<float16*>(&(tmp));
+  return *reinterpret_cast<float16*>(&tmp);
 #elif defined(PADDLE_NEON_64)
   // test on RPI
   float16 res;
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index d8b7f9e3fc74040189ade83049e4a1c3348e08de..ab4ac38b3c5aea5239b2b52efe20dffefa62dba1 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -21,7 +21,7 @@ if(WITH_GPU)
     CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
     link_paddle_test(test_Tensor)
     CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-    link_paddle_test(test_lazyAssign)
+    link_paddle_test(test_lazyAssign)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
@@ -33,3 +33,4 @@ add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)
 add_simple_unittest(test_Matrix)
+add_simple_unittest(test_float16)
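The source of the new test_float16 target registered above is not part of this diff. As an illustration only, here is a minimal sketch of what such a unit test could check, assuming add_simple_unittest links against gtest (as other Paddle tests do) and that float16, float_to_half_rn and half_to_float live in the paddle namespace exposed by paddle/math/float16.h; it is not the actual test added in this change.

// Hypothetical sketch of a CPU-side round-trip check for the conversion
// helpers declared in float16.h above; names and namespace are assumptions.
#include <gtest/gtest.h>

#include "paddle/math/float16.h"

namespace paddle {

TEST(float16, float_round_trip_cpu) {
  // 1.0f has an exact half-precision encoding (0x3c00), so converting through
  // float_to_half_rn and back with half_to_float should be lossless.
  float16 h = float_to_half_rn(1.0f);
  EXPECT_EQ(h.x, 0x3c00);
  EXPECT_EQ(half_to_float(h), 1.0f);

  // 0.5f (0x3800) is another exactly representable value.
  float16 half_val = float_to_half_rn(0.5f);
  EXPECT_EQ(half_val.x, 0x3800);
  EXPECT_EQ(half_to_float(half_val), 0.5f);
}

}  // namespace paddle

Values without an exact half representation (e.g. 0.1f) would need a tolerance-based comparison such as EXPECT_NEAR instead of EXPECT_EQ.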