diff --git a/test/legacy_test/test_dropout_op.py b/test/legacy_test/test_dropout_op.py
index 540d30da2e6cad2dd8f8b1a58d485ce4bb8149d9..884bef2daa6f24cc5a0427183fb77ec282d69909 100644
--- a/test/legacy_test/test_dropout_op.py
+++ b/test/legacy_test/test_dropout_op.py
@@ -311,10 +311,6 @@ class TestFP16DropoutOp(OpTest):
             'is_test': True,
         }
         self.outputs = {'Out': out}
-        # Because prim op compare res with dygraph
-        # when p = 0 dropout api return x,in dygraph mode x_grad = out_grad,
-        # but in static mode x_grad = []
-        self.enable_check_static_comp = False

     def init_test_case(self):
         self.input_size = [32, 64]
@@ -362,22 +358,10 @@ class TestBF16DropoutOp(OpTest):
         }

     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_prim=True)

     def test_check_grad_normal(self):
-        self.check_grad(['X'], 'Out')
-
-    def test_check_output_for_prim(self):
-        # greater_equal does't support bfloat16 in cpu
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(core.CUDAPlace(0))
-
-    def test_check_grad_for_prim(self):
-        # greater_equal does't support bfloat16 in cpu
-        if core.is_compiled_with_cuda():
-            self.check_grad_with_place(
-                core.CUDAPlace(0), ['X'], 'Out', only_check_prim=True
-            )
+        self.check_grad(['X'], 'Out', check_prim=True)


 class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase):
@@ -1451,8 +1435,9 @@ class PrimNet(paddle.nn.Layer):
         training=True,
         mode="upscale_in_train",
     ):
+        y = paddle.assign(x)
         out = paddle.nn.functional.dropout(
-            x=x, p=p, axis=axis, training=training, mode=mode
+            x=y, p=p, axis=axis, training=training, mode=mode
         )
         return out

@@ -1476,6 +1461,16 @@ def apply_to_static(net, use_cinn):
             'float32',
             places,
         ),
+        (
+            'bfp16',
+            np.random.rand(100000),
+            0.3,
+            False,
+            'upscale_in_train',
+            1002,
+            'bfloat16',
+            places,
+        ),
         (
             'fp64',
             np.random.rand(100000),
@@ -1506,6 +1501,16 @@ def apply_to_static(net, use_cinn):
             'float32',
             places,
         ),
+        (
+            'p=1.0,dtype=bfp16',
+            np.random.rand(100000),
+            1.0,
+            True,
+            'upscale_in_train',
+            1002,
+            'bfloat16',
+            places,
+        ),
         (
             'p=1.0,test=False',
             np.random.rand(100000),
@@ -1517,15 +1522,35 @@ def apply_to_static(net, use_cinn):
             places,
         ),
         (
-            'p=0.0',
+            'p=1.0,test=False,dtype=bfp16',
             np.random.rand(100000),
             1.0,
+            False,
+            'upscale_in_train',
+            1002,
+            'bfloat16',
+            places,
+        ),
+        (
+            'p=0.0',
+            np.random.rand(100000),
+            0,
             True,
             'upscale_in_train',
             1002,
             'float32',
             places,
         ),
+        (
+            'p=0.0,dtype=bfp16',
+            np.random.rand(100000),
+            0,
+            True,
+            'upscale_in_train',
+            1002,
+            'bfloat16',
+            places,
+        ),
         (
             'downgrade_train',
             np.random.rand(100000),
@@ -1536,6 +1561,16 @@ def apply_to_static(net, use_cinn):
             'float32',
             places,
         ),
+        (
+            'downgrade_train,dtype=bfp16',
+            np.random.rand(100000),
+            0.5,
+            False,
+            'downscale_in_infer',
+            1002,
+            'bfloat16',
+            places,
+        ),
         (
             'fp32_cpu',
             np.random.rand(100000),
@@ -1571,7 +1606,11 @@ def apply_to_static(net, use_cinn):
 class TestCompositeDropout(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.x = cls.x.astype(cls.dtype)
+        cls.x = (
+            cls.x.astype(cls.dtype)
+            if cls.dtype != "bfloat16"
+            else cls.x.astype("float32")
+        )
         core._set_prim_all_enabled(True)

     @classmethod
@@ -1596,12 +1635,18 @@ class TestCompositeDropout(unittest.TestCase):
             paddle.set_device("gpu")
         core.set_prim_eager_enabled(False)
         input_ = paddle.to_tensor(
-            data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+            data=self.x,
+            dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+            place=place,
+            stop_gradient=False,
         )
         output = paddle.nn.functional.dropout(
             input_, self.p, training=(not self.is_test), mode=self.mode
         )
         grad = paddle.grad(output, input_)
+        if self.dtype == "bfloat16":
+            output = paddle.cast(output, "float32")
+            grad[0] = paddle.cast(grad[0], "float32")
         return output, grad[0]

     def test_static_comp(self):
@@ -1614,11 +1659,16 @@ class TestCompositeDropout(unittest.TestCase):
             mp, sp = paddle.static.Program(), paddle.static.Program()
             with paddle.static.program_guard(mp, sp):
                 input_ = paddle.static.data(
-                    'x', shape=self.x.shape, dtype=self.x.dtype
+                    'x',
+                    shape=self.x.shape,
+                    dtype=self.x.dtype
+                    if self.dtype != "bfloat16"
+                    else "float32",
                 )
                 input_.stop_gradient = False
+                y = paddle.assign(input_)
                 output = paddle.nn.functional.dropout(
-                    input_,
+                    y,
                     self.p,
                     training=(not self.is_test),
                     mode=self.mode,
@@ -1626,6 +1676,9 @@ class TestCompositeDropout(unittest.TestCase):
                 if core._is_fwd_prim_enabled():
                     primapi.to_prim(mp.blocks)
                 grad = paddle.static.gradients(output, input_)[0]
+                if self.dtype == "bfloat16":
+                    output = paddle.cast(output, "float32")
+                    grad = paddle.cast(grad, "float32")
             exe = paddle.static.Executor(place)
             exe.run(sp)
             fwd, rev = exe.run(
@@ -1662,7 +1715,10 @@ class TestCompositeDropout(unittest.TestCase):
                 paddle.set_device("gpu")
             paddle.seed(self.seed)
             input_ = paddle.to_tensor(
-                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+                data=self.x,
+                dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+                place=place,
+                stop_gradient=False,
             )
             net = PrimNet()
             net = apply_to_static(net, False)
@@ -1670,6 +1726,9 @@ class TestCompositeDropout(unittest.TestCase):
                 input_, self.p, training=(not self.is_test), mode=self.mode
             )
             grad = paddle.grad(output, input_)
+            if self.dtype == "bfloat16":
+                output = paddle.cast(output, "float32")
+                grad[0] = paddle.cast(grad[0], "float32")
             fwd_actual.append(output.numpy())
             rev_actual.append(grad[0].numpy())
         for i in range(len(self.places)):
@@ -1696,7 +1755,10 @@ class TestCompositeDropout(unittest.TestCase):
                 paddle.set_device("gpu")
             paddle.seed(self.seed)
             input_ = paddle.to_tensor(
-                data=self.x, dtype=self.dtype, place=place, stop_gradient=False
+                data=self.x,
+                dtype=self.dtype if self.dtype != "bfloat16" else "float32",
+                place=place,
+                stop_gradient=False,
            )
             net = PrimNet()
             net = apply_to_static(net, True)
@@ -1704,6 +1766,9 @@ class TestCompositeDropout(unittest.TestCase):
                 input_, self.p, training=(not self.is_test), mode=self.mode
            )
             grad = paddle.grad(output, input_)
+            if self.dtype == "bfloat16":
+                output = paddle.cast(output, "float32")
+                grad[0] = paddle.cast(grad[0], "float32")
             fwd_actual.append(output.numpy())
             rev_actual.append(grad[0].numpy())
         i = 0
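A quick illustration of the comparison pattern the bfloat16 cases above rely on may help when reading the added casts. This is a hedged sketch, not code from the test file: the variable names are made up, and it assumes a Paddle build where `paddle.cast` supports bfloat16 on the current device. numpy has no native bfloat16 dtype, which is why the test keeps its raw input in float32 and casts bfloat16 outputs and gradients back to float32 before `.numpy()` and `np.testing` comparisons; with p=0.0, dropout returns x unchanged (as the removed comment in the first hunk notes), so only bfloat16 rounding error remains.

```python
import numpy as np
import paddle

# Hedged sketch (not part of the test file): numpy cannot represent bfloat16,
# so bf16 results are cast back to float32 before any numpy-based comparison.
x_np = np.random.rand(8).astype("float32")

x = paddle.to_tensor(x_np)
out = paddle.nn.functional.dropout(
    x, p=0.0, training=True, mode="upscale_in_train"
)

bf16_out = paddle.cast(out, "bfloat16")      # stand-in for a bfloat16 result
fp32_out = paddle.cast(bf16_out, "float32")  # cast back before .numpy()

# With p=0.0 dropout is the identity, so only bfloat16 rounding error remains.
np.testing.assert_allclose(fp32_out.numpy(), x_np, rtol=1e-2, atol=1e-2)
```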