Commit 41bd1f91

Authored Nov 28, 2017 by Kexin Zhao

fix gpu test, clean code and add cmake

Parent: a5feb771
Showing 5 changed files with 109 additions and 212 deletions (+109 -212)
CMakeLists.txt                       +1   -0
cmake/configure.cmake                +5   -0
paddle/math/float16.h                +61  -156
paddle/math/tests/test_float16.cpp   +0   -8
paddle/math/tests/test_float16.cu    +42  -48
CMakeLists.txt

@@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
 option(GLIDE_INSTALL "Download and install go dependencies " ON)
 option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
+option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
cmake/configure.cmake

@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
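Note that configure.cmake only defines the PADDLE_ARM_FP16 macro and adds the -march=armv8.2-a+fp16+simd flag; whether the native __fp16 path is actually taken is decided in the header, presumably via the PADDLE_WITH_NATIVE_FP16 checks visible in the float16.h diff below. As a rough, hypothetical sketch (not code from this commit) of how such a compile definition is typically consumed:

// Illustrative only: code touching the ARM native half type stays behind
// the macro defined by configure.cmake, so builds configured without
// -DWITH_ARM_FP16=ON never attempt to compile __fp16 arithmetic.
#ifdef PADDLE_ARM_FP16
#include <arm_neon.h>  // provides float16_t, an alias for __fp16
inline float16_t double_half(float16_t v) { return v + v; }
#else
inline float double_half(float v) { return v + v; }
#endif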
paddle/math/float16.h

@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
 
-#include <cstdint>
+#include <stdint.h>
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 {
 public:
   uint16_t x;
 
+  // Constructors
   HOSTDEVICE inline float16() : x(0) {}
 
   HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
@@ -89,8 +90,7 @@
 #ifdef PADDLE_WITH_NATIVE_FP16
   // __fp16 is a native half precision data type for arm cpu,
-  // float16_t is an alias for __fp16 in arm_fp16.h,
-  // which is included in arm_neon.h.
+  // float16_t is an alias for __fp16
   HOSTDEVICE inline explicit float16(const float16_t& h) {
     x = *reinterpret_cast<const uint16_t*>(&h);
   }
@@ -141,6 +141,7 @@
     return *this;
   }
 
+  // Assignment operators
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline float16& operator=(const half& rhs) {
 #if CUDA_VERSION >= 9000
@@ -219,6 +220,7 @@
     return *this;
   }
 
+  // Conversion opertors
 #ifdef PADDLE_CUDA_FP16
   HOSTDEVICE inline explicit operator half() const {
 #if CUDA_VERSION >= 9000
@@ -353,27 +355,54 @@
 // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are
 // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in
 // CUDA 9.0 regarding the half data type.
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \
-    __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
 DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
 }
 
 DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
 }
 
-DEVICE inline half operator-(const half& a) { return __hneg(a); }
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
 
 DEVICE inline half& operator+=(half& a, const half& b) {
   a = a + b;
@@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) {
 }
 
 DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
 }
 
 DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
 }
 
-/*
-DEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(__hadd(half(a), half(b)));
-}
-DEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(__hsub(half(a), half(b)));
-}
-DEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(__hmul(half(a), half(b)));
-}
-DEVICE inline float16 operator/(const float16& a, const float16& b) {
-  float num = __half2float(half(a));
-  float denom = __half2float(half(b));
-  return float16(num / denom);
-}
-DEVICE inline float16 operator-(const float16& a) {
-  return float16(__hneg(half(a)));
-}
-DEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-DEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-DEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-DEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-DEVICE inline bool operator==(const float16& a, const float16& b) {
-  return __heq(half(a), half(b));
-}
-DEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return __hne(half(a), half(b));
-}
-DEVICE inline bool operator<(const float16& a, const float16& b) {
-  return __hlt(half(a), half(b));
-}
-DEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return __hle(half(a), half(b));
-}
-DEVICE inline bool operator>(const float16& a, const float16& b) {
-  return __hgt(half(a), half(b));
-}
-DEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return __hge(half(a), half(b));
-}
-*/
+#endif  // PADDLE_CUDA_FP16
 
 // Arithmetic operators on ARMv8.2-A CPU
-#elif defined(PADDLE_WITH_NATIVE_FP16)
+#if defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
@@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-/*
-HOST inline float16 operator+(const float16& a, const float16& b) {
-  return float16(vaddh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a, const float16& b) {
-  return float16(vsubh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator*(const float16& a, const float16& b) {
-  return float16(vmulh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator/(const float16& a, const float16& b) {
-  return float16(vdivh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline float16 operator-(const float16& a) {
-  return float16(vnegh_f16(float16_t(a)));
-}
-HOST inline float16& operator+=(float16& a, const float16& b) {
-  a = a + b;
-  return a;
-}
-HOST inline float16& operator-=(float16& a, const float16& b) {
-  a = a - b;
-  return a;
-}
-HOST inline float16& operator*=(float16& a, const float16& b) {
-  a = a * b;
-  return a;
-}
-HOST inline float16& operator/=(float16& a, const float16& b) {
-  a = a / b;
-  return a;
-}
-HOST inline bool operator==(const float16& a, const float16& b) {
-  return static_cast<bool>(vceqh_f16(float16_t(a), float16_t(b)));
-}
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
-HOST inline bool operator<(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vclth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) < float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator<=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcleh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) <= float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator>(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgth_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) > float(b);
-#endif  // PADDLE_NEON_64
-}
-HOST inline bool operator>=(const float16& a, const float16& b) {
-#ifdef PADDLE_NEON_64
-  return static_cast<bool>(vcgeh_f16(float16_t(a), float16_t(b)));
-#else
-  return float(a) >= float(b);
-#endif  // PADDLE_NEON_64
-}
-*/
 
 // Arithmetic operators, software emulated on other CPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
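The pattern introduced in the arithmetic hunks above is worth spelling out: __CUDA_ARCH__ is only defined during NVCC's device compilation passes, so the old file-scope guard (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) removed these operators entirely from the host pass and from lower-architecture passes. The new code keeps the operators visible whenever CUDA_VERSION < 9000 and checks the architecture inside each body, falling back to float arithmetic. A minimal standalone sketch of the same idiom, with hypothetical names rather than code from this commit:

#include <cuda_fp16.h>

// Hypothetical helper illustrating the per-body __CUDA_ARCH__ guard:
// native fp16 math on sm_53 and newer, fp32 emulation everywhere else.
__device__ inline half half_add(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
  return __hadd(a, b);                                     // native half add
#else
  return __float2half(__half2float(a) + __half2float(b));  // emulate in float
#endif
}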
paddle/math/tests/test_float16.cpp

@@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) {
   EXPECT_EQ(float16(true).x, 0x3c00);
   EXPECT_EQ(float16(false).x, 0x0000);
 
-  // Implicit conversion to and from Eigen::half
-  /*
-  Eigen::half tmp = float16(1.0f);
-  float16 v_conv = tmp;
-  EXPECT_EQ(tmp.x, 0x3c00);
-  EXPECT_EQ(v_conv.x, 0x3c00);
-  */
-
   // Default constructor
   float16 v_def;
   EXPECT_EQ(v_def.x, 0x0000);
paddle/math/tests/test_float16.cu

@@ -16,40 +16,37 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign) \
-  __global__ void op_type( \
-      const float16* in1, const float16* in2, float16* out) { \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 
 #define COMPOUND_KERNEL(op_type, sign) \
-  __global__ void op_type(float16* in1, const float16* in2) { \
-    in1[0] sign in2[0]; \
-  }
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
 
 #define COMPARISON_KERNEL(op_type, sign) \
-  __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
     out[0] = in1[0] sign in2[0]; \
   }
 
 #define ARITHMETIC_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2, *out; \
-    float16 *d_in1, *d_in2, *d_out; \
-    int size = sizeof(float16); \
+    half *in1, *in2, *out; \
+    half *d_in1, *d_in2, *d_out; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
    cudaMalloc((void**)&d_in2, size); \
    cudaMalloc((void**)&d_out, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    out = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    out = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
     cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(out[0]), v_out); \
+    EXPECT_EQ(float(float16(out[0])), v_out); \
     free(in1); \
     free(in2); \
     free(out); \
@@ -61,20 +58,20 @@ limitations under the License. */
 #define COMPOUND_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, float v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
-    int size = sizeof(float16); \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2); \
     cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \
-    EXPECT_EQ(float(in1[0]), v_out); \
+    EXPECT_EQ(float(float16(in1[0])), v_out); \
     free(in1); \
     free(in2); \
     cudaFree(d_in1); \
@@ -84,18 +81,18 @@ limitations under the License. */
 #define COMPARISON_KERNEL_LAUNCH(op_type) \
   void Test##op_type(float v_in1, float v_in2, bool v_out) { \
     LOG(INFO) << "Test " << #op_type << " on GPU!"; \
-    float16 *in1, *in2; \
-    float16 *d_in1, *d_in2; \
+    half *in1, *in2; \
+    half *d_in1, *d_in2; \
     bool *out, *d_out; \
-    int size = sizeof(float16); \
+    int size = sizeof(half); \
     cudaMalloc((void**)&d_in1, size); \
     cudaMalloc((void**)&d_in2, size); \
     cudaMalloc((void**)&d_out, 1); \
-    in1 = (float16*)malloc(size); \
-    in2 = (float16*)malloc(size); \
+    in1 = (half*)malloc(size); \
+    in2 = (half*)malloc(size); \
     out = (bool*)malloc(1); \
-    in1[0] = float16(v_in1); \
-    in2[0] = float16(v_in2); \
+    in1[0] = half(float16(v_in1)); \
+    in2[0] = half(float16(v_in2)); \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out); \
@@ -112,6 +109,7 @@ limitations under the License. */
 #ifdef PADDLE_CUDA_FP16
 namespace paddle {
 
+#if CUDA_VERSION < 9000
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
@@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul)
 ARITHMETIC_KERNEL_LAUNCH(Div)
 
 // Negative sign kernel
-__global__ void Neg(float16* in) { in[0] = -in[0]; }
+__global__ void Neg(half* in) { in[0] = -in[0]; }
 
 void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
-  float16 *in, *d_in;
-  int size = sizeof(float16);
+  half *in, *d_in;
+  int size = sizeof(half);
   cudaMalloc((void**)&d_in, size);
-  in = (float16*)malloc(size);
-  in[0] = float16(v_in);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
   Neg<<<1, 1>>>(d_in);
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(in[0]), v_out);
+  EXPECT_EQ(float(float16(in[0])), v_out);
   free(in);
   cudaFree(d_in);
 }
@@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) {
   TestGreaterEqual(4, 4, true);
   TestGreaterEqual(4, 5, false);
 }
+#endif  // CUDA_VERSION
 
 TEST(float16, conversion_on_gpu) {
   // Explicit conversion to and from cuda half
@@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) {
   EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
   EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
 
-  // Implicit conversion to and from cuda half
-  half tmp = float16(1.0f);
-  float16 val = tmp;
-  EXPECT_EQ(val.x, 0x3c00);
-
   // Assignment operator
   float16 v_assign;
-  v_assign = tmp;
+  v_assign = half(float16(1.0f));
   EXPECT_EQ(v_assign.x, 0x3c00);
 }
 
 }  // namespace paddle
 
 #endif  // PADDLE_CUDA_FP16
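For readers not used to the macro-generated launchers, each Test##op_type follows the same host-staging pattern: allocate one element on host and device, copy the inputs in, run a <<<1, 1>>> kernel, copy the result back, and compare. A stripped-down standalone version using plain float, so the sketch does not depend on the float16/half conversions above, might look like this:

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

// Mirrors ARITHMETIC_KERNEL(Add, +) but on plain float, so the sketch
// stands alone without paddle's float16/half plumbing.
__global__ void AddOne(const float* in1, const float* in2, float* out) {
  out[0] = in1[0] + in2[0];
}

int main() {
  float h_in1 = 1.0f, h_in2 = 2.0f, h_out = 0.0f;
  float *d_in1, *d_in2, *d_out;
  int size = sizeof(float);
  cudaMalloc((void**)&d_in1, size);
  cudaMalloc((void**)&d_in2, size);
  cudaMalloc((void**)&d_out, size);
  cudaMemcpy(d_in1, &h_in1, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_in2, &h_in2, size, cudaMemcpyHostToDevice);
  AddOne<<<1, 1>>>(d_in1, d_in2, d_out);
  cudaMemcpy(&h_out, d_out, size, cudaMemcpyDeviceToHost);
  assert(h_out == 3.0f);  // the real test uses EXPECT_EQ on the converted half
  printf("1 + 2 = %f\n", h_out);
  cudaFree(d_in1);
  cudaFree(d_in2);
  cudaFree(d_out);
  return 0;
}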