diff --git a/test/amp/test_amp_api.py b/test/amp/test_amp_api.py
index 179236d909cbe0f0e1dc89fefb5f91ef3e9a11dd..acb47d3302e1975f85c646aa8c62a0e4cd969a9a 100644
--- a/test/amp/test_amp_api.py
+++ b/test/amp/test_amp_api.py
@@ -18,6 +18,7 @@ import numpy as np
 from amp_base_models import AmpTestBase
 
 import paddle
+import paddle.nn.functional as F
 from paddle import nn
 from paddle.static import amp
 
@@ -153,5 +154,88 @@ class TestGradScaler(AmpTestBase):
         self.assertTrue('check_finite_and_unscale' not in op_list)
 
 
+class TestFp16Guard(AmpTestBase):
+    def test_fp16_guard(self):
+        paddle.enable_static()
+
+        def run_example_code():
+            place = paddle.CUDAPlace(0)
+            main_program = paddle.static.Program()
+            startup_program = paddle.static.Program()
+
+            exe = paddle.static.Executor(place)
+
+            fetch_vars = []
+            # 1) Use fp16_guard to control the range of ops that use fp16 kernels.
+            with paddle.static.program_guard(main_program, startup_program):
+                with paddle.static.amp.fp16_guard():
+                    data = paddle.static.data(
+                        name='X', shape=[None, 1, 28, 28], dtype='float32'
+                    )
+                    conv2d = paddle.static.nn.conv2d(
+                        input=data, num_filters=6, filter_size=3
+                    )
+                    bn = paddle.static.nn.batch_norm(input=conv2d, act="relu")
+
+                pool = F.max_pool2d(bn, kernel_size=2, stride=2)
+                hidden = paddle.static.nn.fc(pool, size=10)
+                loss = paddle.mean(hidden)
+                fetch_vars = [loss]
+                # 2) Create the optimizer and set `multi_precision` to True.
+                # Setting `multi_precision` to True helps avoid poor accuracy
+                # or slow convergence in some cases.
+                optimizer = paddle.optimizer.Momentum(
+                    learning_rate=0.01, multi_precision=True
+                )
+                # 3) Ops in `custom_black_list` are kept in the float32 computation type.
+                amp_list = paddle.static.amp.CustomOpLists(
+                    custom_black_list=['pool2d']
+                )
+                # 4) The entry point of Paddle AMP.
+                # Enable pure fp16 training by setting `use_pure_fp16` to True.
+                optimizer = paddle.static.amp.decorate(
+                    optimizer,
+                    amp_list,
+                    init_loss_scaling=128.0,
+                    use_dynamic_loss_scaling=True,
+                    use_pure_fp16=True,
+                )
+                # If you don't use the default_startup_program(), you should pass
+                # your own `startup_program` into `minimize`.
+                optimizer.minimize(loss)
+
+            exe.run(startup_program)
+            # 5) Use `amp_init` after the FP32 parameters are initialized (such as by `exe.run(startup_program)`).
+            # If you want to run a testing pass, pass your `test_program` into `amp_init` as well.
+            optimizer.amp_init(place, scope=paddle.static.global_scope())
+
+            x_fp32 = np.random.random(size=[1, 1, 28, 28]).astype("float32")
+            (loss_data,) = exe.run(
+                main_program, feed={"X": x_fp32}, fetch_list=[loss.name]
+            )
+
+            self.assertEqual(
+                paddle.static.global_scope()
+                .find_var("conv2d_0.b_0")
+                .get_tensor()
+                ._dtype(),
+                paddle.float16,
+            )
+            self.assertEqual(
+                paddle.static.global_scope()
+                .find_var("fc_0.b_0")
+                .get_tensor()
+                ._dtype(),
+                paddle.float32,
+            )
+
+        if (
+            paddle.is_compiled_with_cuda()
+            and len(paddle.static.cuda_places()) > 0
+        ):
+            run_example_code()
+        paddle.disable_static()
+
+
 if __name__ == '__main__':
     unittest.main()