Unverified commit 0df9e4ce, authored by Charles-hit, committed by GitHub

[AMP Prim OP] support dropout prim ops bfloat16 dtype (#54175)

* fix dropout api and support bf16 for prim

* fix code style

* fix dropout test

* fix dropout p = 0 test
Parent d4451cb0
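Most of the hunks below apply one recurring pattern: numpy has no native bfloat16, so the bfloat16 test cases keep their host-side data in float32, and outputs and gradients are cast back to float32 before being compared as numpy arrays. A condensed, self-contained sketch of that dygraph path (the function name and the example call are illustrative, not part of the commit):

```python
import numpy as np
import paddle


def dygraph_dropout_case(x_np, p, dtype, is_test=False, mode="upscale_in_train"):
    # numpy cannot hold bfloat16, so bfloat16 cases keep the host data in float32
    input_ = paddle.to_tensor(
        data=x_np,
        dtype=dtype if dtype != "bfloat16" else "float32",
        stop_gradient=False,
    )
    output = paddle.nn.functional.dropout(
        input_, p, training=(not is_test), mode=mode
    )
    grad = paddle.grad(output, input_)
    if dtype == "bfloat16":
        # cast back so results can be compared as float32 numpy arrays
        output = paddle.cast(output, "float32")
        grad[0] = paddle.cast(grad[0], "float32")
    return output.numpy(), grad[0].numpy()


# a bfloat16-labelled case still runs on float32 host data
fwd, rev = dygraph_dropout_case(np.random.rand(1000).astype("float32"), 0.3, "bfloat16")
```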
......@@ -311,10 +311,6 @@ class TestFP16DropoutOp(OpTest):
'is_test': True,
}
self.outputs = {'Out': out}
# Because prim ops compare results with dygraph:
# when p = 0 the dropout api returns x; in dygraph mode x_grad = out_grad,
# but in static mode x_grad = []
self.enable_check_static_comp = False
def init_test_case(self):
self.input_size = [32, 64]
......@@ -362,22 +358,10 @@ class TestBF16DropoutOp(OpTest):
}
def test_check_output(self):
self.check_output()
self.check_output(check_prim=True)
def test_check_grad_normal(self):
self.check_grad(['X'], 'Out')
def test_check_output_for_prim(self):
# greater_equal doesn't support bfloat16 on cpu
if core.is_compiled_with_cuda():
self.check_output_with_place(core.CUDAPlace(0))
def test_check_grad_for_prim(self):
# greater_equal doesn't support bfloat16 on cpu
if core.is_compiled_with_cuda():
self.check_grad_with_place(
core.CUDAPlace(0), ['X'], 'Out', only_check_prim=True
)
self.check_grad(['X'], 'Out', check_prim=True)
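The removed test_check_output_for_prim / test_check_grad_for_prim methods above were GPU-only because greater_equal has no bfloat16 CPU kernel, and the composite (prim) lowering of dropout builds its mask with that kind of comparison; the new check_prim=True flag folds the prim comparison into the regular checks instead. A hedged sketch of such a composite lowering for upscale_in_train (a simplification for illustration, not the rule from this commit):

```python
import paddle


def composite_dropout(x, p):
    # simplified composite form of upscale_in_train:
    #   mask = (uniform >= p); out = x * mask / (1 - p)
    rand = paddle.cast(paddle.uniform(x.shape, min=0.0, max=1.0), x.dtype)
    mask = paddle.cast(rand >= p, x.dtype)  # greater_equal runs in the compute dtype
    return x * mask / (1.0 - p)


# bfloat16 only has the comparison kernel on GPU, hence the old CUDA-only guard
if paddle.is_compiled_with_cuda():
    paddle.set_device("gpu")
    x = paddle.rand([32, 64]).astype("bfloat16")
    out = composite_dropout(x, p=0.3)
```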
class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
......@@ -1451,8 +1435,9 @@ class PrimNet(paddle.nn.Layer):
training=True,
mode="upscale_in_train",
):
y = paddle.assign(x)
out = paddle.nn.functional.dropout(
x=x, p=p, axis=axis, training=training, mode=mode
x=y, p=p, axis=axis, training=training, mode=mode
)
return out
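The paddle.assign indirection added above (and mirrored later in test_static_comp) works around the p = 0 corner case noted in the comment removed from TestFP16DropoutOp: with p = 0 the dropout API can return its input unchanged, so in static mode paddle.static.gradients(output, input_) may come back empty because output and input_ are the same variable. Keeping an assign op between them makes the gradient well defined. A minimal standalone sketch of the workaround (shape and p value are illustrative):

```python
import paddle

paddle.enable_static()
mp, sp = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(mp, sp):
    x = paddle.static.data("x", shape=[4, 4], dtype="float32")
    x.stop_gradient = False
    y = paddle.assign(x)  # keeps an op between the input and the dropout output
    out = paddle.nn.functional.dropout(y, p=0.0)
    grad = paddle.static.gradients(out, x)[0]  # no longer empty when p == 0
```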
......@@ -1476,6 +1461,16 @@ def apply_to_static(net, use_cinn):
'float32',
places,
),
(
'bfp16',
np.random.rand(100000),
0.3,
False,
'upscale_in_train',
1002,
'bfloat16',
places,
),
(
'fp64',
np.random.rand(100000),
......@@ -1506,6 +1501,16 @@ def apply_to_static(net, use_cinn):
'float32',
places,
),
(
'p=1.0,dtype=bfp16',
np.random.rand(100000),
1.0,
True,
'upscale_in_train',
1002,
'bfloat16',
places,
),
(
'p=1.0,test=False',
np.random.rand(100000),
......@@ -1517,15 +1522,35 @@ def apply_to_static(net, use_cinn):
places,
),
(
'p=0.0',
'p=1.0,test=False,dtype=bfp16',
np.random.rand(100000),
1.0,
False,
'upscale_in_train',
1002,
'bfloat16',
places,
),
(
'p=0.0',
np.random.rand(100000),
0,
True,
'upscale_in_train',
1002,
'float32',
places,
),
(
'p=0.0,dtype=bfp16',
np.random.rand(100000),
0,
True,
'upscale_in_train',
1002,
'bfloat16',
places,
),
(
'downgrade_train',
np.random.rand(100000),
......@@ -1536,6 +1561,16 @@ def apply_to_static(net, use_cinn):
'float32',
places,
),
(
'downgrade_train,dtype=bfp16',
np.random.rand(100000),
0.5,
False,
'downscale_in_infer',
1002,
'bfloat16',
places,
),
(
'fp32_cpu',
np.random.rand(100000),
......@@ -1571,7 +1606,11 @@ def apply_to_static(net, use_cinn):
class TestCompositeDropout(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.x = cls.x.astype(cls.dtype)
cls.x = (
cls.x.astype(cls.dtype)
if cls.dtype != "bfloat16"
else cls.x.astype("float32")
)
core._set_prim_all_enabled(True)
@classmethod
......@@ -1596,12 +1635,18 @@ class TestCompositeDropout(unittest.TestCase):
paddle.set_device("gpu")
core.set_prim_eager_enabled(False)
input_ = paddle.to_tensor(
data=self.x, dtype=self.dtype, place=place, stop_gradient=False
data=self.x,
dtype=self.dtype if self.dtype != "bfloat16" else "float32",
place=place,
stop_gradient=False,
)
output = paddle.nn.functional.dropout(
input_, self.p, training=(not self.is_test), mode=self.mode
)
grad = paddle.grad(output, input_)
if self.dtype == "bfloat16":
output = paddle.cast(output, "float32")
grad[0] = paddle.cast(grad[0], "float32")
return output, grad[0]
def test_static_comp(self):
......@@ -1614,11 +1659,16 @@ class TestCompositeDropout(unittest.TestCase):
mp, sp = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(mp, sp):
input_ = paddle.static.data(
'x', shape=self.x.shape, dtype=self.x.dtype
'x',
shape=self.x.shape,
dtype=self.x.dtype
if self.dtype != "bfloat16"
else "float32",
)
input_.stop_gradient = False
y = paddle.assign(input_)
output = paddle.nn.functional.dropout(
input_,
y,
self.p,
training=(not self.is_test),
mode=self.mode,
......@@ -1626,6 +1676,9 @@ class TestCompositeDropout(unittest.TestCase):
if core._is_fwd_prim_enabled():
primapi.to_prim(mp.blocks)
grad = paddle.static.gradients(output, input_)[0]
if self.dtype == "bfloat16":
output = paddle.cast(output, "float32")
grad = paddle.cast(grad, "float32")
exe = paddle.static.Executor(place)
exe.run(sp)
fwd, rev = exe.run(
......@@ -1662,7 +1715,10 @@ class TestCompositeDropout(unittest.TestCase):
paddle.set_device("gpu")
paddle.seed(self.seed)
input_ = paddle.to_tensor(
data=self.x, dtype=self.dtype, place=place, stop_gradient=False
data=self.x,
dtype=self.dtype if self.dtype != "bfloat16" else "float32",
place=place,
stop_gradient=False,
)
net = PrimNet()
net = apply_to_static(net, False)
......@@ -1670,6 +1726,9 @@ class TestCompositeDropout(unittest.TestCase):
input_, self.p, training=(not self.is_test), mode=self.mode
)
grad = paddle.grad(output, input_)
if self.dtype == "bfloat16":
output = paddle.cast(output, "float32")
grad[0] = paddle.cast(grad[0], "float32")
fwd_actual.append(output.numpy())
rev_actual.append(grad[0].numpy())
for i in range(len(self.places)):
......@@ -1696,7 +1755,10 @@ class TestCompositeDropout(unittest.TestCase):
paddle.set_device("gpu")
paddle.seed(self.seed)
input_ = paddle.to_tensor(
data=self.x, dtype=self.dtype, place=place, stop_gradient=False
data=self.x,
dtype=self.dtype if self.dtype != "bfloat16" else "float32",
place=place,
stop_gradient=False,
)
net = PrimNet()
net = apply_to_static(net, True)
......@@ -1704,6 +1766,9 @@ class TestCompositeDropout(unittest.TestCase):
input_, self.p, training=(not self.is_test), mode=self.mode
)
grad = paddle.grad(output, input_)
if self.dtype == "bfloat16":
output = paddle.cast(output, "float32")
grad[0] = paddle.cast(grad[0], "float32")
fwd_actual.append(output.numpy())
rev_actual.append(grad[0].numpy())
i = 0
......