add test for float16

af37838e · Kexin Zhao · d9642cb3 · af37838e · af37838e
隐藏空白更改
内联并排

Showing with 10 addition and 9 deletion

paddle/math/float16.h paddle/math/float16.h +8 -8

paddle/math/tests/CMakeLists.txt paddle/math/tests/CMakeLists.txt +2 -1

未找到文件。
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <istream>
 #include <ostream>
-#include <cuda.h>
+#define USE_EIGEN
 #ifdef USE_EIGEN  // delete this #if macro
 #include "Eigen/src/Core/arch/CUDA/Half.h"
@@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h);
 struct PADDLE_ALIGN(2) float16 {
  uint16_t x;
-  // explicit for different types, implicit for half and Eigen::half
  PADDLE_HOSTDEVICE inline float16() {}
  PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 {
  PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {}
 #endif  // USE_EIGEN
-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
  // __fp16 is a native half precision data type for arm cpu,
  // float16_t is an alias for __fp16 in arm_fp16.h,
  // which is included in arm_neon.h.
@@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 {
  }
 #endif  // USE_EIGEN
-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
  PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) {
    x = *reinterpret_cast<uint16_t*>(rhs);
    return *this;
@@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 {
  }
 #endif  // USE_EIGEN
-#ifdef PADDLE_NEON
+#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \
+    defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16)
  // check whether it works or not
  PADDLE_HOSTDEVICE inline operator float16_t() const {
    float16 h = *this;
@@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) {
 __device__ inline float16 operator/(const float16& a, const float16& b) {
  // TODO(kexinzhao): check the cuda version that starts to support __hdiv
-  // instinsic
  float num = __half2float(half(a));
  float denom = __half2float(half(b));
  return float16(num / denom);
@@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1;
 PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
  half tmp = __float2half(f);
-  return *reinterpret_cast<float16*>(&(tmp));
+  return *reinterpret_cast<float16*>(&tmp);
 #elif defined(PADDLE_NEON_64)  // test on RPI
  float16 res;

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -21,7 +21,7 @@ if(WITH_GPU)
    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
    link_paddle_test(test_Tensor)
    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-    link_paddle_test(test_lazyAssign)
+    link_paddle_test(test_lazyAssign) 
 else()
    compile_cu_as_cpp(test_Tensor.cu)
    add_unittest(test_Tensor test_Tensor.cu)
@@ -33,3 +33,4 @@ add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)
 add_simple_unittest(test_Matrix)
+add_simple_unittest(test_float16)