Unverified · Commit 791963ab · authored by Charles-hit, committed by GitHub

support some prim ops bf16 dtype (#54399)

Parent 2f0b4ad0
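The bfloat16 cases in the tests below follow the OpTest convention of carrying bf16 data as np.uint16. A minimal sketch of that round-trip, assuming it mirrors the eager_op_test helpers convert_float_to_uint16 / convert_uint16_to_float (bfloat16 keeps the upper 16 bits of an IEEE-754 float32; the standalone names float_to_uint16 / uint16_to_float here are illustrative, not Paddle APIs):

import numpy as np

# Illustrative helpers (assumed to mirror eager_op_test's
# convert_float_to_uint16 / convert_uint16_to_float): bfloat16 is the
# upper 16 bits of a float32, stored as an np.uint16 array.
def float_to_uint16(x):
    return np.right_shift(
        x.astype(np.float32).view(np.uint32), 16
    ).astype(np.uint16)

def uint16_to_float(x):
    return np.left_shift(x.astype(np.uint32), 16).view(np.float32)

x = np.random.uniform(0, 100, (10, 10)).astype(np.float32)
# Round-tripping quantizes the reference data to bf16 precision, the same
# trick TestGatherNdOpIndex1 below applies before computing expected outputs.
x_bf16 = uint16_to_float(float_to_uint16(x))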
@@ -49,6 +49,7 @@ PD_REGISTER_KERNEL(sum_raw,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int16_t,
                    int,
                    int64_t,
......
@@ -44,6 +44,7 @@ PD_REGISTER_KERNEL(sum,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    int16_t,
                    int,
                    int64_t,
......
@@ -136,8 +136,8 @@ class TestConcatOp6(TestConcatOp):
         self.dtype = self.get_dtype()
         self.python_api = paddle.concat
         self.public_python_api = paddle.concat
-        self.enable_cinn = False
         self.init_test_data()
+        self.if_enable_cinn()
         self.lod = [[20, 80]]
         self.out_lod = [[20, 80, 20, 80, 20, 80]]
         self.inputs = {
@@ -156,6 +156,9 @@ class TestConcatOp6(TestConcatOp):
         out = np.concatenate((self.x0, self.x1, self.x2), axis=self.actual_axis)
         self.outputs = {'Out': (out, self.out_lod)}

+    def if_enable_cinn(self):
+        pass
+
     def test_check_output(self):
         self.check_output()
@@ -177,7 +180,7 @@ class TestConcatOp7(TestConcatOp):
         self.python_api = paddle.concat
         self.public_python_api = paddle.concat
         self.prim_op_type = "prim"
-        self.enable_cinn = True
+        self.if_enable_cinn()
         self.dtype = self.get_dtype()
         self.init_test_data()
         self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
@@ -194,6 +197,9 @@ class TestConcatOp7(TestConcatOp):
             )
         }

+    def if_enable_cinn(self):
+        pass
+
     def get_dtype(self):
         return "float64"
@@ -226,7 +232,6 @@ def create_test_AxisTensor(parent):
             self.op_type = "concat"
             self.python_api = paddle.concat
             self.public_python_api = paddle.concat
-            self.enable_cinn = False
             self.dtype = self.get_dtype()
             self.init_test_data()
             self.inputs = {
@@ -286,7 +291,6 @@ def create_test_fp16(parent):
             self.op_type = "concat"
             self.python_api = paddle.concat
             self.public_python_api = paddle.concat
-            self.enable_cinn = False
             self.dtype = self.get_dtype()
             self.init_test_data()
             self.inputs = {
......
@@ -198,7 +198,6 @@ class TestElementwiseBF16Op(OpTest):
         self.python_api = paddle.maximum
         self.public_python_api = paddle.maximum
         self.prim_op_type = "prim"
-        self.enable_cinn = False
         self.dtype = np.uint16
         self.inputs = {
             'X': convert_float_to_uint16(self.x),
@@ -207,6 +206,7 @@ class TestElementwiseBF16Op(OpTest):
         self.outputs = {
             'Out': convert_float_to_uint16(np.maximum(self.x, self.y))
         }
+        self.if_enable_cinn()

     def test_check_output(self):
         if hasattr(self, 'attrs'):
@@ -214,6 +214,9 @@ class TestElementwiseBF16Op(OpTest):
         else:
             self.check_output(check_dygraph=True)

+    def if_enable_cinn(self):
+        pass
+
     def test_check_grad_normal(self):
         if hasattr(self, 'attrs'):
             # check_prim=False, bfloat16 is not supported in `less_equal`
@@ -221,16 +224,26 @@ class TestElementwiseBF16Op(OpTest):
                 ['X', 'Y'], 'Out', numeric_grad_delta=0.05, check_dygraph=False
             )
         else:
-            self.check_grad(['X', 'Y'], 'Out', numeric_grad_delta=0.05)
+            self.check_grad(
+                ['X', 'Y'], 'Out', numeric_grad_delta=0.05, check_prim=True
+            )

     def test_check_grad_ingore_x(self):
         self.check_grad(
-            ['Y'], 'Out', numeric_grad_delta=0.05, no_grad_set=set("X")
+            ['Y'],
+            'Out',
+            numeric_grad_delta=0.05,
+            no_grad_set=set("X"),
+            check_prim=True,
         )

     def test_check_grad_ingore_y(self):
         self.check_grad(
-            ['X'], 'Out', numeric_grad_delta=0.05, no_grad_set=set('Y')
+            ['X'],
+            'Out',
+            numeric_grad_delta=0.05,
+            no_grad_set=set('Y'),
+            check_prim=True,
         )
......
@@ -19,6 +19,7 @@ from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci

 import paddle
 from paddle import fluid
+from paddle.fluid import core


 def pow_grad(x, y, dout):
@@ -270,8 +271,10 @@ class TestElementwisePowOpFP16(OpTest):
 class TestElementwisePowBF16Op(OpTest):
     def setUp(self):
         self.op_type = "elementwise_pow"
+        self.prim_op_type = "prim"
         self.dtype = np.uint16
         self.python_api = paddle.pow
+        self.public_python_api = paddle.pow
         x = np.random.uniform(0, 1, [20, 5]).astype(np.float32)
         y = np.random.uniform(0, 1, [20, 5]).astype(np.float32)
@@ -290,6 +293,14 @@ class TestElementwisePowBF16Op(OpTest):
     def test_check_grad(self):
         self.check_grad(['X', 'Y'], 'Out')
+        if core.is_compiled_with_cuda():
+            self.check_grad_with_place(
+                core.CUDAPlace(0),
+                ['X', 'Y'],
+                'Out',
+                check_prim=True,
+                only_check_prim=True,
+            )


 if __name__ == '__main__':
......
@@ -15,7 +15,11 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest, convert_float_to_uint16
+from eager_op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    convert_uint16_to_float,
+)

 import paddle
 from paddle import fluid
@@ -31,6 +35,7 @@ class TestGatherNdOpWithEmptyIndex(OpTest):
         self.python_api = paddle.gather_nd
         self.public_python_api = paddle.gather_nd
         self.config_dtype()
+        self.if_enable_cinn()
         if self.dtype == np.float64:
             target_dtype = "float64"
         elif self.dtype == np.float16:
@@ -45,6 +50,9 @@ class TestGatherNdOpWithEmptyIndex(OpTest):
         self.inputs = {'X': xnp, 'Index': np.array([[], []]).astype("int32")}
         self.outputs = {'Out': output}

+    def if_enable_cinn(self):
+        pass
+
     def config_dtype(self):
         self.dtype = np.float64
@@ -85,6 +93,7 @@ class TestGatherNdOpWithIndex1(OpTest):
         self.python_api = paddle.gather_nd
         self.public_python_api = paddle.gather_nd
         self.config_dtype()
+        self.if_enable_cinn()
         if self.dtype == np.float64:
             target_dtype = "float64"
         elif self.dtype == np.float16:
@@ -100,6 +109,9 @@ class TestGatherNdOpWithIndex1(OpTest):
         self.inputs = {'X': xnp, 'Index': index}
         self.outputs = {'Out': output}

+    def if_enable_cinn(self):
+        pass
+
     def config_dtype(self):
         self.dtype = np.float64
@@ -189,7 +201,9 @@ class TestGatherNdOpWithLowIndexBF16(TestGatherNdOpWithLowIndex):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+        )


 class TestGatherNdOpIndex1(OpTest):
@@ -208,6 +222,8 @@ class TestGatherNdOpIndex1(OpTest):
         else:
             target_dtype = "float32"
         xnp = np.random.uniform(0, 100, (10, 10)).astype(target_dtype)
+        if self.dtype == np.uint16:
+            xnp = convert_uint16_to_float(convert_float_to_uint16(xnp))
         index = np.array([1, 2]).astype("int32")
         output = xnp[tuple(index.T)]
         if self.dtype == np.uint16:
@@ -215,6 +231,9 @@ class TestGatherNdOpIndex1(OpTest):
             output = convert_float_to_uint16(output)
         self.inputs = {'X': xnp, 'Index': index}
         self.outputs = {'Out': output}
+        self.if_enable_cinn()
+
+    def if_enable_cinn(self):
         # the outputs are 0D-tensor, CINN not support
         self.enable_cinn = False
@@ -225,7 +244,7 @@ class TestGatherNdOpIndex1(OpTest):
         self.check_output()

     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', check_prim=True)
+        self.check_grad(['X'], 'Out', check_prim=True, numeric_grad_delta=0.05)


 class TestGatherNdOpIndex1FP16(TestGatherNdOpIndex1):
@@ -248,7 +267,9 @@ class TestGatherNdOpIndex1BF16(TestGatherNdOpIndex1):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+        )


 class TestGatherNdOpWithSameIndexAsX(OpTest):
@@ -304,7 +325,9 @@ class TestGatherNdOpWithSameIndexAsXBF16(TestGatherNdOpWithSameIndexAsX):
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)
+        self.check_grad_with_place(
+            place, ['X'], 'Out', check_prim=True, numeric_grad_delta=0.5
+        )


 class TestGatherNdOpWithHighRankSame(OpTest):
......
@@ -277,13 +277,16 @@ class TestMaxOp_ZeroDim(OpTest):
         self.prim_op_type = "prim"
         self.python_api = paddle.max
         self.public_python_api = paddle.max
-        self.enable_cinn = False
+        self.if_enable_cinn()
         self.inputs = {'X': np.random.random([]).astype("float64")}
         self.attrs = {'dim': []}
         self.outputs = {
             'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
         }

+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
     def test_check_output(self):
         self.check_output()
......