add float16 arithmetic operators on new GPU

18d616ed · Kexin Zhao · d03dbb97 · 18d616ed · 18d616ed
隐藏空白更改
内联并排

Showing 2 changed files with 82 additions and 7 deletions

paddle/fluid/platform/float16.h paddle/fluid/platform/float16.h +72 -3

python/paddle/fluid/tests/unittests/test_dropout_op.py python/paddle/fluid/tests/unittests/test_dropout_op.py +10 -4

未找到文件。
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -483,8 +483,77 @@ DEVICE inline bool operator>=(const half& a, const half& b) {

 #endif  // PADDLE_CUDA_FP16

-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+DEVICE inline float16 operator+(const float16& a, const float16& b) {
+  return float16(__hadd(half(a), half(b)));
+}
+
+DEVICE inline float16 operator-(const float16& a, const float16& b) {
+  return float16(__hsub(half(a), half(b)));
+}
+
+DEVICE inline float16 operator*(const float16& a, const float16& b) {
+  return float16(__hmul(half(a), half(b)));
+}
+
+DEVICE inline float16 operator/(const float16& a, const float16& b) {
+  // TODO(kexinzhao): check the cuda version that starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+}
+
+DEVICE inline float16 operator-(const float16& a) {
+  return float16(__hneg(half(a)));
+}
+
+DEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+DEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+DEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+DEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+DEVICE inline bool operator==(const float16& a, const float16& b) {
+  return __heq(half(a), half(b));
+}
+
+DEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return __hne(half(a), half(b));
+}
+
+DEVICE inline bool operator<(const float16& a, const float16& b) {
+  return __hlt(half(a), half(b));
+}
+
+DEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return __hle(half(a), half(b));
+}
+
+DEVICE inline bool operator>(const float16& a, const float16& b) {
+  return __hgt(half(a), half(b));
+}
+
+DEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return __hge(half(a), half(b));
+}
+
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
 HOST inline float16 operator+(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
@@ -668,7 +737,7 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
 }

-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU/GPU
 #else
 HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
  return float16(float(a) + float(b));

--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -86,10 +86,13 @@ class TestDropoutOp5(OpTest):
 class TestFP16DropoutOp1(OpTest):
    def setUp(self):
        x = np.random.random((32, 64)).astype("float16")
+        prob = 0.35
+        out = x * (1.0 - prob)
+
        self.op_type = "dropout"
        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
-        self.outputs = {'Out': x * (1.0 - self.attrs['dropout_prob'])}
+        self.attrs = {'dropout_prob': prob, 'fix_seed': True, 'is_test': True}
+        self.outputs = {'Out': out}

    def test_check_output(self):
        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
@@ -99,10 +102,13 @@ class TestFP16DropoutOp1(OpTest):
 class TestFP16DropoutOp2(OpTest):
    def setUp(self):
        x = np.random.random((32, 64, 3)).astype("float16")
+        prob = 0.75
+        out = x * (1.0 - prob)
+
        self.op_type = "dropout"
        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
-        self.outputs = {'Out': x * (1.0 - self.attrs['dropout_prob'])}
+        self.attrs = {'dropout_prob': prob, 'is_test': True}
+        self.outputs = {'Out': out}

    def test_check_output(self):
        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):