Merge pull request #9231 from kexinzhao/elementwise_add_fp16

Add float16 support to Elementwise Add op

Merge pull request #9231 from kexinzhao/elementwise_add_fp16
Add float16 support to Elementwise Add op
c1e9b1e3 · Kexin Zhao · GitHub · d126933d · d307b5e4 · c1e9b1e3
3 changed files
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -14,19 +14,20 @@ limitations under the License. */
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/platform/float16.h"
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_add,
+    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
-                                  int64_t>);
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -600,7 +600,7 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 // Arithmetic operators for float16 on ARMv8.2-A CPU
 #elif defined(PADDLE_WITH_NATIVE_FP16)
-HOST inline float16 operator+(const float16& a, const float16& b) {
+inline float16 operator+(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -616,7 +616,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
  return res;
 }
-HOST inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -632,7 +632,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
  return res;
 }
-HOST inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -648,7 +648,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
  return res;
 }
-HOST inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -664,7 +664,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
  return res;
 }
-HOST inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -679,27 +679,27 @@ HOST inline float16 operator-(const float16& a) {
  return res;
 }
-HOST inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
  a = a + b;
  return a;
 }
-HOST inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
  a = a - b;
  return a;
 }
-HOST inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
  a = a * b;
  return a;
 }
-HOST inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
  a = a / b;
  return a;
 }
-HOST inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -715,11 +715,9 @@ HOST inline bool operator==(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
 }
-HOST inline bool operator!=(const float16& a, const float16& b) {
+inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
-  return !(a == b);
-}
-HOST inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -735,7 +733,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
 }
-HOST inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -751,7 +749,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
 }
-HOST inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -767,7 +765,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
 }
-HOST inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -785,69 +783,69 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
 // Arithmetic operators for float16, software emulated on other CPU
 #else
-HOST inline float16 operator+(const float16& a, const float16& b) {
+inline float16 operator+(const float16& a, const float16& b) {
  return float16(float(a) + float(b));
 }
-HOST inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
  return float16(float(a) - float(b));
 }
-HOST inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
  return float16(float(a) * float(b));
 }
-HOST inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
  return float16(float(a) / float(b));
 }
-HOST inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
  float16 res;
  res.x = a.x ^ 0x8000;
  return res;
 }
-HOST inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
  a = float16(float(a) + float(b));
  return a;
 }
-HOST inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
  a = float16(float(a) - float(b));
  return a;
 }
-HOST inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
  a = float16(float(a) * float(b));
  return a;
 }
-HOST inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
  a = float16(float(a) / float(b));
  return a;
 }
-HOST inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
  return float(a) == float(b);
 }
-HOST inline bool operator!=(const float16& a, const float16& b) {
+inline bool operator!=(const float16& a, const float16& b) {
  return float(a) != float(b);
 }
-HOST inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
  return float(a) < float(b);
 }
-HOST inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
  return float(a) <= float(b);
 }
-HOST inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
  return float(a) > float(b);
 }
-HOST inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
  return float(a) >= float(b);
 }
 #endif

--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -13,158 +13,243 @@
 # limitations under the License.
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
-class TestElementwiseOp(OpTest):
+class TestElementwiseAddOp(OpTest):
    def setUp(self):
        self.op_type = "elementwise_add"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_axis()
        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {'Out': self.out}
    def test_check_output(self):
        self.check_output()
    def test_check_grad_normal(self):
+        if self.dtype == np.float16:
+            return
        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
    def test_check_grad_ingore_x(self):
+        if self.dtype == np.float16:
+            return
        self.check_grad(
            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
    def test_check_grad_ingore_y(self):
+        if self.dtype == np.float16:
+            return
        self.check_grad(
            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
-class TestElementwiseAddOp_scalar(TestElementwiseOp):
+    def init_dtype(self):
-    def setUp(self):
+        pass
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+    def init_axis(self):
+        pass
-class TestElementwiseAddOp_scalar2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1, 1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.float16
-class TestElementwiseAddOp_Vector(TestElementwiseOp):
+    def test_check_output(self):
-    def setUp(self):
+        if core.is_compiled_with_cuda():
-        self.op_type = "elementwise_add"
+            place = core.CUDAPlace(0)
-        self.inputs = {
+            if core.is_float16_supported(place):
-            'X': np.random.random((32, )).astype("float32"),
+                self.check_output_with_place(place, atol=1e-3)
-            'Y': np.random.random((32, )).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
-class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
+class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
-    def setUp(self):
+    def init_input_output(self):
-        self.op_type = "elementwise_add"
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.inputs = {
+        self.y = np.random.rand(1).astype(self.dtype)
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+        self.out = self.x + self.y
-            'Y': np.random.rand(2).astype(np.float32)
-        }
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
-        }
+class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
-class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
-        }
-        self.attrs = {'axis': 1}
+class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
-        self.outputs = {
+    def init_input_output(self):
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        }
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
-class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
+class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
-    def setUp(self):
+    def init_input_output(self):
-        self.op_type = "elementwise_add"
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.inputs = {
+        self.y = np.random.rand(1, 1).astype(self.dtype)
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+        self.out = self.x + self.y
-            'Y': np.random.rand(4).astype(np.float32)
-        }
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
-        }
+class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
-class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
-        self.attrs = {'axis': 1}
+class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
-        self.outputs = {
+    def init_input_output(self):
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
+        self.x = np.random.random((32, )).astype(self.dtype)
-        }
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
-class TestElementwiseAddOp_broadcast_4(TestElementwiseOp):
+class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
-    def setUp(self):
+    def init_input_output(self):
-        self.op_type = "elementwise_add"
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.inputs = {
+        self.y = np.random.rand(2).astype(self.dtype)
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+        self.out = self.x + self.y.reshape(2, 1, 1)
-            'Y': np.random.rand(2, 1).astype(np.float32)
-        }
-        self.attrs = {'axis': 0}
+    def init_axis(self):
-        self.outputs = {
+        self.axis = 0
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1, 1)
-        }
-class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
+class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
-    def setUp(self):
+    def init_input_output(self):
-        self.op_type = "elementwise_add"
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.inputs = {
+        self.y = np.random.rand(2).astype(self.dtype)
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
+        self.out = self.x + self.y.reshape(2, 1, 1)
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
-        self.attrs = {'axis': 1}
+    def init_axis(self):
-        self.outputs = {
+        self.axis = 0
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
-        }
-class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
+class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
-    def setUp(self):
+    def init_input_output(self):
-        self.op_type = "elementwise_add"
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
-        self.inputs = {
+        self.y = np.random.rand(3).astype(self.dtype)
-            'X': np.random.rand(2, 1).astype(np.float32),
+        self.out = self.x + self.y.reshape(1, 3, 1)
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.attrs = {'axis': 1}
+    def init_axis(self):
-        self.outputs = {
+        self.axis = 1
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
-        }
+class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1)
+    def init_axis(self):
+        self.axis = 1
+class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+    def init_axis(self):
+        self.axis = 1
+class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+    def init_axis(self):
+        self.axis = 1
+class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+    def init_axis(self):
+        self.axis = 0
+class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+    def init_axis(self):
+        self.axis = 0
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+    def init_axis(self):
+        self.axis = 1
+class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+    def init_axis(self):
+        self.axis = 1
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+    def init_axis(self):
+        self.axis = 1
+class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+    def init_axis(self):
+        self.axis = 1
 if __name__ == '__main__':