From 2cff9839915426d12375f6db851b31d01574137d Mon Sep 17 00:00:00 2001
From: Vvsmile <17864154871@163.com>
Date: Thu, 13 Apr 2023 14:23:54 +0800
Subject: [PATCH] [AMP OP&Test] adjust test_elementwise_sub's tolerance,
 max_relative_error of grad and (#50953)

* adjust test_elementwise_sub's tolerance, max_relative_error of grad and
atol/rtol of output to 1e-3

* fix the dtype in setUp

* fix the elementwise_sub optest

* modify elementwise_sub optest

* fix and add bf16/fp16 to elementwise_sub

* fix elementwise_sub bugs

* fix bugs

* fix elementwise_sub op

* fix the data type

* fix elementwise_sub

* fix elementwise

* fix elementwise_sub

* fix bugs

* fix elementwise sub

* fix elementwise_sub

* remove scalar and vector
---
 .../unittests/test_elementwise_sub_op.py      | 531 +++++++++++++++---
 1 file changed, 460 insertions(+), 71 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
index 1034e5fb59c..b979fa339de 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
@@ -21,6 +21,7 @@ from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci
 
 import paddle
 from paddle import fluid
+from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
 
 
@@ -30,14 +31,18 @@ class TestElementwiseOp(OpTest):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"),
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
         self.if_enable_cinn()
 
+    def init_dtype(self):
+        self.dtype = np.float64
+
     def test_check_output(self):
         self.check_output()
 
@@ -66,7 +71,55 @@ class TestElementwiseOp(OpTest):
         self.check_prim = True
 
     def if_enable_cinn(self):
-        pass
+        self.enable_cinn = False
+
+
+class TestElementwiseFP16OP(TestElementwiseOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+        self.if_enable_cinn()
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', max_relative_error=0.1
+        )
+
+    def test_check_grad_ingore_x(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', no_grad_set=set("X"), max_relative_error=0.1
+        )
+
+    def test_check_grad_ingore_y(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', no_grad_set=set('Y'), max_relative_error=0.1
+        )
 
 
 class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp):
@@ -75,19 +128,45 @@ class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, []).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, []).astype("float64"),
+            'X': np.random.uniform(0.1, 1, []).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, []).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
         self.if_enable_cinn()
 
-    def if_check_prim(self):
-        self.check_prim = True
 
-    def if_enable_cinn(self):
-        self.enable_cinn = False
+class TestElementwiseSubFP16OP_ZeroDim1(TestElementwiseSubOp_ZeroDim1):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseSubBF16OP_ZeroDim1(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, []).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, []).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+        self.if_enable_cinn()
 
 
 class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp):
@@ -96,19 +175,45 @@ class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, []).astype("float64"),
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, []).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
         self.if_enable_cinn()
 
-    def if_check_prim(self):
-        self.check_prim = True
 
-    def if_enable_cinn(self):
-        self.enable_cinn = False
+class TestElementwiseSubFP16OP_ZeroDim2(TestElementwiseSubOp_ZeroDim2):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseSubBF16OP_ZeroDim2(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, []).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+        self.if_enable_cinn()
 
 
 class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp):
@@ -117,21 +222,52 @@ class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, []).astype("float64"),
-            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"),
+            'X': np.random.uniform(0.1, 1, []).astype(self.dtype),
+            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
         self.if_enable_cinn()
 
-    def if_check_prim(self):
-        self.check_prim = True
 
-    def if_enable_cinn(self):
-        self.enable_cinn = False
+class TestElementwiseSubFP16OP_ZeroDim3(TestElementwiseSubOp_ZeroDim3):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_ZeroDim3(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, []).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+        self.if_enable_cinn()
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
 class TestBF16ElementwiseOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_sub"
@@ -151,6 +287,12 @@ class TestBF16ElementwiseOp(OpTest):
         self.if_check_prim()
         self.if_enable_cinn()
 
+    def if_check_prim(self):
+        self.check_prim = True
+
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
     def test_check_output(self):
         self.check_output()
 
@@ -162,17 +304,6 @@ class TestBF16ElementwiseOp(OpTest):
             ['Y'], 'Out', no_grad_set=set("X"), check_prim=self.check_prim
         )
 
-    def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', no_grad_set=set('Y'), check_prim=self.check_prim
-        )
-
-    def if_check_prim(self):
-        self.check_prim = True
-
-    def if_enable_cinn(self):
-        self.enable_cinn = False
-
 
 @skip_check_grad_ci(
     reason="[skip shape check] Use y_shape(1) to test broadcast."
@@ -183,9 +314,10 @@ class TestElementwiseSubOp_scalar(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(10, 3, 4).astype(np.float64),
-            'Y': np.random.rand(1).astype(np.float64),
+            'X': np.random.rand(10, 3, 4).astype(self.dtype),
+            'Y': np.random.rand(1).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
@@ -197,21 +329,23 @@ class TestElementwiseSubOp_Vector(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.random((100,)).astype("float64"),
-            'Y': np.random.random((100,)).astype("float64"),
+            'X': np.random.random((100,)).astype(self.dtype),
+            'Y': np.random.random((100,)).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
 
 
-class TestElementwiseSubOp_broadcast_O(TestElementwiseOp):
+class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.python_api = paddle.subtract
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(100, 3, 2).astype(np.float64),
-            'Y': np.random.rand(100).astype(np.float64),
+            'X': np.random.rand(100, 3, 2).astype(self.dtype),
+            'Y': np.random.rand(100).astype(self.dtype),
         }
 
         self.attrs = {'axis': 0}
@@ -244,13 +378,66 @@ class TestElementwiseSubOp_broadcast_O(TestElementwiseOp):
         )
 
 
-class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp_broadcast_O):
+class TestElementwiseSubFP16OP_broadcast_0(TestElementwiseSubOp_broadcast_0):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_broadcast_0(TestElementwiseBF16OP):
     def setUp(self):
         self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
         self.python_api = paddle.subtract
         self.inputs = {
-            'X': np.random.rand(2, 100, 3).astype(np.float64),
-            'Y': np.random.rand(100).astype(np.float64),
+            'X': np.random.rand(100, 3, 2).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32),
+        }
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1)
+        }
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.attrs = {'axis': 0}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, check_dygraph=False)
+
+    def test_check_grad_normal(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['X', 'Y'], 'Out', check_dygraph=False
+        )
+
+    def test_check_grad_ingore_x(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['Y'], 'Out', no_grad_set=set("X"), check_dygraph=False
+        )
+
+    def test_check_grad_ingore_y(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', no_grad_set=set('Y'), check_dygraph=False
+        )
+
+
+class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp_broadcast_0):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.python_api = paddle.subtract
+        self.init_dtype()
+        self.inputs = {
+            'X': np.random.rand(2, 100, 3).astype(self.dtype),
+            'Y': np.random.rand(100).astype(self.dtype),
         }
 
         self.attrs = {'axis': 1}
@@ -259,15 +446,46 @@ class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp_broadcast_O):
         }
 
 
+class TestElementwiseSubFP16OP_broadcast_1(TestElementwiseSubOp_broadcast_1):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_broadcast_1(TestElementwiseBF16OP_broadcast_0):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.inputs = {
+            'X': np.random.rand(2, 100, 3).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32),
+        }
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1)
+        }
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.attrs = {'axis': 1}
+
+
 class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(2, 3, 100).astype(np.float64),
-            'Y': np.random.rand(100).astype(np.float64),
+            'X': np.random.rand(2, 3, 100).astype(self.dtype),
+            'Y': np.random.rand(100).astype(self.dtype),
         }
 
         self.outputs = {
@@ -275,17 +493,70 @@ class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
         }
         self.if_check_prim()
 
-    def if_check_prim(self):
-        self.check_prim = True
+
+class TestElementwiseSubFP16OP_broadcast_2(TestElementwiseSubOp_broadcast_2):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_broadcast_2(TestElementwiseBF16OP_broadcast_0):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(100).astype(np.float32),
+        }
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100)
+        }
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_broadcast_3(TestElementwiseBF16OP_broadcast_0):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.inputs = {
+            'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
+            'Y': np.random.rand(10, 12).astype(np.float32),
+        }
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1)
+        }
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.attrs = {'axis': 1}
 
 
-class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp_broadcast_O):
+class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp_broadcast_0):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.python_api = paddle.subtract
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(2, 10, 12, 3).astype(np.float64),
-            'Y': np.random.rand(10, 12).astype(np.float64),
+            'X': np.random.rand(2, 10, 12, 3).astype(self.dtype),
+            'Y': np.random.rand(10, 12).astype(self.dtype),
         }
 
         self.attrs = {'axis': 1}
@@ -294,21 +565,52 @@ class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp_broadcast_O):
         }
 
 
+class TestElementwiseSubFP16OP_broadcast_3(TestElementwiseSubOp_broadcast_3):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
 class TestElementwiseSubOp_broadcast_4(TestElementwiseOp):
     def setUp(self):
         self.op_type = "elementwise_sub"
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(2, 5, 3, 12).astype(np.float64),
-            'Y': np.random.rand(2, 5, 1, 12).astype(np.float64),
+            'X': np.random.rand(2, 5, 3, 12).astype(self.dtype),
+            'Y': np.random.rand(2, 5, 1, 12).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
 
-    def if_check_prim(self):
-        self.check_prim = True
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_broadcast_4(TestElementwiseBF16OP_broadcast_0):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.inputs = {
+            'X': np.random.rand(2, 5, 3, 12).astype(np.float32),
+            'Y': np.random.rand(2, 5, 1, 12).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
+
+
+class TestElementwiseSubFP16OP_broadcast_4(TestElementwiseSubOp_broadcast_4):
+    def init_dtype(self):
+        self.dtype = np.float16
 
 
 class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
@@ -317,15 +619,43 @@ class TestElementwiseSubOp_commonuse_1(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(2, 3, 100).astype(np.float64),
-            'Y': np.random.rand(1, 1, 100).astype(np.float64),
+            'X': np.random.rand(2, 3, 100).astype(self.dtype),
+            'Y': np.random.rand(1, 1, 100).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
 
-    def if_check_prim(self):
-        self.check_prim = True
+
+class TestElementwiseSubFP16OP_commonuse_1(TestElementwiseSubOp_commonuse_1):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_commonuse_1(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 100).astype(np.float32),
+            'Y': np.random.rand(1, 1, 100).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
 
 
 class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
@@ -334,15 +664,43 @@ class TestElementwiseSubOp_commonuse_2(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(10, 3, 1, 4).astype(np.float64),
-            'Y': np.random.rand(10, 1, 12, 1).astype(np.float64),
+            'X': np.random.rand(10, 3, 1, 4).astype(self.dtype),
+            'Y': np.random.rand(10, 1, 12, 1).astype(self.dtype),
         }
         self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
         self.if_check_prim()
 
-    def if_check_prim(self):
-        self.check_prim = True
+
+class TestElementwiseSubFP16OP_commonuse_2(TestElementwiseSubOp_commonuse_2):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_commonuse_2(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.rand(10, 3, 1, 4).astype(np.float32),
+            'Y': np.random.rand(10, 1, 12, 1).astype(np.float32),
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
 
 
 class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
@@ -351,9 +709,10 @@ class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
         self.python_api = paddle.subtract
         self.public_python_api = paddle.subtract
         self.prim_op_type = "prim"
+        self.init_dtype()
         self.inputs = {
-            'X': np.random.rand(10, 12).astype(np.float64),
-            'Y': np.random.rand(2, 3, 10, 12).astype(np.float64),
+            'X': np.random.rand(10, 12).astype(self.dtype),
+            'Y': np.random.rand(2, 3, 10, 12).astype(self.dtype),
         }
         self.attrs = {'axis': 2}
 
@@ -362,8 +721,38 @@ class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp):
         }
         self.if_check_prim()
 
-    def if_check_prim(self):
-        self.check_prim = True
+
+class TestElementwiseSubFP16OP_xsize_lessthan_ysize(
+    TestElementwiseSubOp_xsize_lessthan_ysize
+):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class TestElementwiseBF16OP_xsize_lessthan_ysize(TestElementwiseBF16OP):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.dtype = np.uint16
+        self.python_api = paddle.subtract
+        self.public_python_api = paddle.subtract
+        self.prim_op_type = "prim"
+        self.inputs = {
+            'X': np.random.rand(10, 12).astype(np.float32),
+            'Y': np.random.rand(2, 3, 10, 12).astype(np.float32),
+        }
+        self.attrs = {'axis': 2}
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+        self.inputs = {
+            'X': convert_float_to_uint16(self.inputs['X']),
+            'Y': convert_float_to_uint16(self.inputs['Y']),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])}
+        self.if_check_prim()
 
 
 class TestComplexElementwiseSubOp(OpTest):
@@ -473,7 +862,7 @@ class TestSubtractApi(unittest.TestCase):
     def test_name(self):
         with fluid.program_guard(fluid.Program()):
             x = paddle.static.data(name="x", shape=[2, 3], dtype="float32")
-            y = paddle.static.data(name='y', shape=[2, 3], dtype='float32')
+            y = paddle.static.data(name='y', shape=[2, 3], dtype=np.float32)
 
             y_1 = self._executed_api(x, y, name='subtract_res')
             self.assertEqual(('subtract_res' in y_1.name), True)
@@ -483,12 +872,12 @@ class TestSubtractApi(unittest.TestCase):
 
             def gen_data():
                 return {
-                    "x": np.array([2, 3, 4]).astype('float32'),
-                    "y": np.array([1, 5, 2]).astype('float32'),
+                    "x": np.array([2, 3, 4]).astype(np.float32),
+                    "y": np.array([1, 5, 2]).astype(np.float32),
                 }
 
-            x = paddle.static.data(name="x", shape=[3], dtype='float32')
-            y = paddle.static.data(name="y", shape=[3], dtype='float32')
+            x = paddle.static.data(name="x", shape=[3], dtype=np.float32)
+            y = paddle.static.data(name="y", shape=[3], dtype=np.float32)
             z = self._executed_api(x, y)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -640,7 +1029,7 @@ class TestTensorSubAPIWarnings(unittest.TestCase):
             paddle.enable_static()
             helper = LayerHelper("elementwise_sub")
             data = paddle.static.data(
-                name='data', shape=[None, 3, 32, 32], dtype='float32'
+                name='data', shape=[None, 3, 32, 32], dtype=np.float32
             )
             out = helper.create_variable_for_type_inference(dtype=data.dtype)
             os.environ['FLAGS_print_extra_attrs'] = "1"
-- 
GitLab