From 0df9e4ce4a04ceb7ca9ba812cecb90bc0700d8f8 Mon Sep 17 00:00:00 2001
From: Charles-hit <56987902+Charles-hit@users.noreply.github.com>
Date: Thu, 1 Jun 2023 20:01:13 +0800
Subject: [PATCH] [AMP Prim OP]support dropout prim ops bfloat16 dtype (#54175)

* fix dropout api and support bf16 for prim

* fix code style

* fix dropout test

* fix dropout p = 0 test
---
 test/legacy_test/test_dropout_op.py | 117 +++++++++++++++++++++-------
 1 file changed, 91 insertions(+), 26 deletions(-)

diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py
index 540d30da2e6..884bef2daa6 100644
--- a/test/legacy_test/test_dropout_op.py
+++ b/test/legacy_test/test_dropout_op.py
@@ -311,10 +311,6 @@ class TestFP16DropoutOp(OpTest):
             'is_test': True,
         }
         self.outputs = {'Out': out}
-        # Because prim op compare res with dygraph
-        # when p = 0 dropout api return x,in dygraph mode x_grad = out_grad,
-        # but in static mode x_grad = []
-        self.enable_check_static_comp = False
 
     def init_test_case(self):
         self.input_size = [32, 64]
@@ -362,22 +358,10 @@ class TestBF16DropoutOp(OpTest):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_prim=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
-
-    def test_check_output_for_prim(self):
-        # greater_equal does't support bfloat16 in cpu
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(core.CUDAPlace(0))
-
-    def test_check_grad_for_prim(self):
-        # greater_equal does't support bfloat16 in cpu
-        if core.is_compiled_with_cuda():
-            self.check_grad_with_place(
-                core.CUDAPlace(0), ['X'], 'Out', only_check_prim=True
-            )
+        self.check_grad(['X'], 'Out', check_prim=True)
 
 
 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
@@ -1451,8 +1435,9 @@ class PrimNet(paddle.nn.Layer):
         training=True,
         mode="upscale_in_train",
     ):
+        y = paddle.assign(x)
         out = paddle.nn.functional.dropout(
-            x=x, p=p, axis=axis, training=training, mode=mode
+            x=y, p=p, axis=axis, training=training, mode=mode
         )
         return out
 
@@ -1476,6 +1461,16 @@ def apply_to_static(net, use_cinn):
         'float32',
         places,
     ),
+    (
+        'bfp16',
+        np.random.rand(100000),
+        0.3,
+        False,
+        'upscale_in_train',
+        1002,
+        'bfloat16',
+        places,
+    ),
     (
         'fp64',
         np.random.rand(100000),
@@ -1506,6 +1501,16 @@ def apply_to_static(net, use_cinn):
         'float32',
         places,
     ),
+    (
+        'p=1.0,dtype=bfp16',
+        np.random.rand(100000),
+        1.0,
+        True,
+        'upscale_in_train',
+        1002,
+        'bfloat16',
+        places,
+    ),
     (
         'p=1.0,test=False',
         np.random.rand(100000),
@@ -1517,15 +1522,35 @@ def apply_to_static(net, use_cinn):
         places,
     ),
     (
-        'p=0.0',
+        'p=1.0,test=False,dtype=bfp16',
         np.random.rand(100000),
         1.0,
+        False,
+        'upscale_in_train',
+        1002,
+        'bfloat16',
+        places,
+    ),
+    (
+        'p=0.0',
+        np.random.rand(100000),
+        0,
         True,
         'upscale_in_train',
         1002,
         'float32',
         places,
     ),
+    (
+        'p=0.0,dtype=bfp16',
+        np.random.rand(100000),
+        0,
+        True,
+        'upscale_in_train',
+        1002,
+        'bfloat16',
+        places,
+    ),
     (
         'downgrade_train',
         np.random.rand(100000),
@@ -1536,6 +1561,16 @@ def apply_to_static(net, use_cinn):
         'float32',
         places,
     ),
+    (
+        'downgrade_train,dtype=bfp16',
+        np.random.rand(100000),
+        0.5,
+        False,
+        'downscale_in_infer',
+        1002,
+        'bfloat16',
+        places,
+    ),
     (
         'fp32_cpu',
         np.random.rand(100000),
@@ -1571,7 +1606,11 @@ def apply_to_static(net, use_cinn):
 class TestCompositeDropout(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.x = cls.x.astype(cls.dtype)
+        cls.x = (
+            cls.x.astype(cls.dtype)
+            if cls.dtype != "bfloat16"
+            else cls.x.astype("float32")
+        )
         core._set_prim_all_enabled(True)
 
     @classmethod
@@ -1596,12 +1635,18 @@ class TestCompositeDropout(unittest.TestCase):
             paddle.set_device("gpu")
         core.set_prim_eager_enabled(False)
         input_ = paddle.to_tensor(
-            data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+            data=self.x,
+            dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+            place=place,
+            stop_gradient=False,
         )
         output = paddle.nn.functional.dropout(
             input_, self.p, training=(not self.is_test), mode=self.mode
         )
         grad = paddle.grad(output, input_)
+        if self.dtype == "bfloat16":
+            output = paddle.cast(output, "float32")
+            grad[0] = paddle.cast(grad[0], "float32")
         return output, grad[0]
 
     def test_static_comp(self):
@@ -1614,11 +1659,16 @@ class TestCompositeDropout(unittest.TestCase):
             mp, sp = paddle.static.Program(), paddle.static.Program()
             with paddle.static.program_guard(mp, sp):
                 input_ = paddle.static.data(
-                    'x', shape=self.x.shape, dtype=self.x.dtype
+                    'x',
+                    shape=self.x.shape,
+                    dtype=self.x.dtype
+                    if self.dtype != "bfloat16"
+                    else "float32",
                 )
                 input_.stop_gradient = False
+                y = paddle.assign(input_)
                 output = paddle.nn.functional.dropout(
-                    input_,
+                    y,
                     self.p,
                     training=(not self.is_test),
                     mode=self.mode,
@@ -1626,6 +1676,9 @@ class TestCompositeDropout(unittest.TestCase):
                 if core._is_fwd_prim_enabled():
                     primapi.to_prim(mp.blocks)
                 grad = paddle.static.gradients(output, input_)[0]
+                if self.dtype == "bfloat16":
+                    output = paddle.cast(output, "float32")
+                    grad = paddle.cast(grad, "float32")
             exe = paddle.static.Executor(place)
             exe.run(sp)
             fwd, rev = exe.run(
@@ -1662,7 +1715,10 @@ class TestCompositeDropout(unittest.TestCase):
                 paddle.set_device("gpu")
             paddle.seed(self.seed)
             input_ = paddle.to_tensor(
-                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+                data=self.x,
+                dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+                place=place,
+                stop_gradient=False,
             )
             net = PrimNet()
             net = apply_to_static(net, False)
@@ -1670,6 +1726,9 @@ class TestCompositeDropout(unittest.TestCase):
                 input_, self.p, training=(not self.is_test), mode=self.mode
             )
             grad = paddle.grad(output, input_)
+            if self.dtype == "bfloat16":
+                output = paddle.cast(output, "float32")
+                grad[0] = paddle.cast(grad[0], "float32")
             fwd_actual.append(output.numpy())
             rev_actual.append(grad[0].numpy())
         for i in range(len(self.places)):
@@ -1696,7 +1755,10 @@ class TestCompositeDropout(unittest.TestCase):
                 paddle.set_device("gpu")
             paddle.seed(self.seed)
             input_ = paddle.to_tensor(
-                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+                data=self.x,
+                dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+                place=place,
+                stop_gradient=False,
            )
             net = PrimNet()
             net = apply_to_static(net, True)
@@ -1704,6 +1766,9 @@ class TestCompositeDropout(unittest.TestCase):
                 input_, self.p, training=(not self.is_test), mode=self.mode
            )
             grad = paddle.grad(output, input_)
+            if self.dtype == "bfloat16":
+                output = paddle.cast(output, "float32")
+                grad[0] = paddle.cast(grad[0], "float32")
             fwd_actual.append(output.numpy())
             rev_actual.append(grad[0].numpy())
         i = 0
-- 
GitLab
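Note (reviewer addition, not part of the patch): the bfloat16 cases added above all follow one pattern, because numpy has no native bfloat16 dtype. The test keeps the host-side array in float32, builds the input tensor as float32 whenever the parameterized dtype is "bfloat16", and casts the dropout output and gradient back to float32 before comparing against the float32 baseline. The sketch below illustrates that pattern under those assumptions; the helper name dropout_fwd_bwd is hypothetical and does not appear in test_dropout_op.py.

    import numpy as np
    import paddle

    def dropout_fwd_bwd(x_np, p, dtype, training=True, mode="upscale_in_train"):
        # Hypothetical helper, not part of the test file: mirrors the eager-mode
        # path in TestCompositeDropout. numpy has no bfloat16, so the host array
        # and the input tensor stay float32 when exercising the bfloat16 case.
        input_ = paddle.to_tensor(
            data=x_np.astype("float32") if dtype == "bfloat16" else x_np.astype(dtype),
            stop_gradient=False,
        )
        out = paddle.nn.functional.dropout(input_, p, training=training, mode=mode)
        (grad,) = paddle.grad(out, input_)
        if dtype == "bfloat16":
            # cast results back to float32 so they compare cleanly as numpy arrays
            out = paddle.cast(out, "float32")
            grad = paddle.cast(grad, "float32")
        return out.numpy(), grad.numpy()

    # Example: with p=0.0 the output should equal x and the gradient should be ones,
    # which is the case the "p=0.0" / "p=0.0,dtype=bfp16" entries now cover.
    fwd, rev = dropout_fwd_bwd(np.random.rand(100000), 0.0, "bfloat16")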