From bd1a240b7105231bef019b94562193dffe5e7400 Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:16:14 +0800 Subject: [PATCH] [cherry-pick]Fix cuda12 test (#54622) * [AMP Prim OP]support some prim ops for bf16 dtype part3 (#54368) * support some prim ops bf16 dtype * fix cmake * [AMP Prim OP]support some prim ops for bf16 dtype part5 (#54422) * support some prim ops for bf16 dtype * remove useless code * support some prim ops bf16 dtype (#54399) --- test/legacy_test/CMakeLists.txt | 7 +- test/legacy_test/test_assign_op.py | 1 - test/legacy_test/test_elementwise_min_op.py | 25 +----- test/legacy_test/test_erf_op.py | 16 ++-- test/legacy_test/test_fill_any_like_op.py | 2 +- .../test_flatten_contiguous_range_op.py | 29 ++++++- test/legacy_test/test_index_select_op.py | 14 +++- test/legacy_test/test_pad_op.py | 7 +- test/legacy_test/test_roll_op.py | 3 - test/legacy_test/test_scatter_op.py | 76 ++++++++++++++---- test/legacy_test/test_split_op.py | 3 +- test/legacy_test/test_squeeze2_op.py | 77 +++++++++++++++++-- test/legacy_test/test_stack_op.py | 4 +- test/legacy_test/test_tile_op.py | 18 +++-- test/legacy_test/test_top_k_v2_op.py | 26 ++++++- test/legacy_test/test_transpose_op.py | 14 +++- test/legacy_test/test_unsqueeze2_op.py | 9 ++- 17 files changed, 248 insertions(+), 83 deletions(-) diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 4b176cfd533..b3d630d2d52 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -1197,7 +1197,12 @@ set(TEST_CINN_OPS test_scatter_nd_op test_strided_slice_op test_instance_norm_op - test_cumsum_op) + test_cumsum_op + test_pad_op + test_split_op + test_erf_op + test_assign_op + test_flatten_contiguous_range_op) foreach(TEST_CINN_OPS ${TEST_CINN_OPS}) if(WITH_CINN) diff --git a/test/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py index 22efd0ac661..9069b11669d 100644 --- a/test/legacy_test/test_assign_op.py +++ b/test/legacy_test/test_assign_op.py @@ -80,7 +80,6 @@ class TestAssignBFP16Op(eager_op_test.OpTest): self.public_python_api = paddle.assign self.op_type = "assign" self.prim_op_type = "prim" - self.enable_cinn = False x = np.random.uniform(0, 1, [100, 10]).astype(np.float32) x = convert_float_to_uint16(x) self.inputs = {'X': x} diff --git a/test/legacy_test/test_elementwise_min_op.py b/test/legacy_test/test_elementwise_min_op.py index fb03a6831ad..9ba527ef803 100644 --- a/test/legacy_test/test_elementwise_min_op.py +++ b/test/legacy_test/test_elementwise_min_op.py @@ -127,18 +127,12 @@ class TestElementwiseMinOp_ZeroDim1(TestElementwiseOp): self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} - def if_enable_cinn(self): - self.enable_cinn = False - class TestElementwiseMinFP16Op_ZeroDim1(TestElementwiseFP16Op): def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype(np.float16) self.y = np.random.uniform(0.1, 1, []).astype(np.float16) - def if_enable_cinn(self): - self.enable_cinn = False - class TestElementwiseMinOp_ZeroDim2(TestElementwiseOp): def setUp(self): @@ -146,24 +140,17 @@ class TestElementwiseMinOp_ZeroDim2(TestElementwiseOp): self.python_api = paddle.minimum self.public_python_api = paddle.minimum self.prim_op_type = "prim" - self.if_enable_cinn() x = np.random.uniform(0.1, 1, [13, 17]).astype("float64") y = np.random.uniform(0.1, 1, []).astype("float64") self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': 
np.minimum(self.inputs['X'], self.inputs['Y'])} - def if_enable_cinn(self): - self.enable_cinn = False - class TestElementwiseMinFP16Op_ZeroDim2(TestElementwiseFP16Op): def init_data(self): self.x = np.random.uniform(0.1, 1, [13, 17]).astype("float16") self.y = np.random.uniform(0.1, 1, []).astype("float16") - def if_enable_cinn(self): - self.enable_cinn = False - class TestElementwiseMinOp_ZeroDim3(TestElementwiseOp): def setUp(self): @@ -177,18 +164,12 @@ class TestElementwiseMinOp_ZeroDim3(TestElementwiseOp): self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])} - def if_enable_cinn(self): - self.enable_cinn = False - class TestElementwiseMinFP16Op_ZeroDim3(TestElementwiseFP16Op): def init_data(self): self.x = np.random.uniform(0.1, 1, []).astype("float16") self.y = np.random.uniform(0.1, 1, [13, 17]).astype("float16") - def if_enable_cinn(self): - self.enable_cinn = False - @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." @@ -388,7 +369,7 @@ class TestElementwiseBF16Op(OpTest): def test_check_grad_ingore_x(self): places = self._get_places() for place in places: - if type(place) is paddle.fluid.libpaddle.CPUPlace: + if isinstance(place, paddle.fluid.libpaddle.CPUPlace): check_prim = False else: check_prim = True @@ -413,7 +394,7 @@ class TestElementwiseBF16Op(OpTest): def test_check_grad_ingore_y(self): places = self._get_places() for place in places: - if type(place) is paddle.fluid.libpaddle.CPUPlace: + if isinstance(place, paddle.fluid.libpaddle.CPUPlace): check_prim = False else: check_prim = True @@ -436,7 +417,7 @@ class TestElementwiseBF16Op(OpTest): ) def if_enable_cinn(self): - self.enable_cinn = False + pass class TestElementwiseMinBF16Op_ZeroDim1(TestElementwiseBF16Op): diff --git a/test/legacy_test/test_erf_op.py b/test/legacy_test/test_erf_op.py index a124a6839ac..b560859cd41 100644 --- a/test/legacy_test/test_erf_op.py +++ b/test/legacy_test/test_erf_op.py @@ -57,15 +57,17 @@ class TestErfLayer(unittest.TestCase): np.testing.assert_allclose(y_ref, y_test, rtol=1e-05) def test_case(self): - self._test_case(fluid.CPUPlace()) - if fluid.is_compiled_with_cuda(): - self._test_case(fluid.CUDAPlace(0)) + with paddle.fluid.framework._static_guard(): + self._test_case(fluid.CPUPlace()) + if fluid.is_compiled_with_cuda(): + self._test_case(fluid.CUDAPlace(0)) def test_name(self): - with fluid.program_guard(fluid.Program()): - x = paddle.static.data('x', [3, 4]) - y = paddle.erf(x, name='erf') - self.assertTrue('erf' in y.name) + with paddle.fluid.framework._static_guard(): + with fluid.program_guard(fluid.Program()): + x = paddle.static.data('x', [3, 4]) + y = paddle.erf(x, name='erf') + self.assertTrue('erf' in y.name) class TestErfFP16OP(OpTest): diff --git a/test/legacy_test/test_fill_any_like_op.py b/test/legacy_test/test_fill_any_like_op.py index 754e1318788..36cf77195cc 100644 --- a/test/legacy_test/test_fill_any_like_op.py +++ b/test/legacy_test/test_fill_any_like_op.py @@ -88,7 +88,7 @@ class TestFillAnyLikeOpBfloat16(OpTest): self.check_output_with_place(place, check_prim=True) def if_enable_cinn(self): - self.enable_cinn = False + pass class TestFillAnyLikeOpValue1(TestFillAnyLikeOp): diff --git a/test/legacy_test/test_flatten_contiguous_range_op.py b/test/legacy_test/test_flatten_contiguous_range_op.py index ea924ce6297..658f03979a9 100644 --- a/test/legacy_test/test_flatten_contiguous_range_op.py +++ b/test/legacy_test/test_flatten_contiguous_range_op.py @@ -30,7 +30,7 @@ class 
TestFlattenOp(OpTest): self.prim_op_type = "comp" self.start_axis = 0 self.stop_axis = -1 - self.skip_cinn() + self.if_enable_cinn() self.init_test_case() self.init_test_dtype() self.init_input_data() @@ -40,8 +40,8 @@ class TestFlattenOp(OpTest): "XShape": np.random.random(self.in_shape).astype("float32"), } - def skip_cinn(self): - self.enable_cinn = True + def if_enable_cinn(self): + pass def test_check_output(self): if str(self.dtype) in {"float16", "uint16"}: @@ -104,6 +104,9 @@ class TestFlattenFP16Op(TestFlattenOp): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op(TestFlattenOp): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -142,6 +145,9 @@ class TestFlattenFP16Op_1(TestFlattenOp_1): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op_1(TestFlattenOp_1): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -180,6 +186,9 @@ class TestFlattenFP16Op_2(TestFlattenOp_2): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op_2(TestFlattenOp_2): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -218,6 +227,9 @@ class TestFlattenFP16Op_3(TestFlattenOp_3): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op_3(TestFlattenOp_3): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -256,6 +268,9 @@ class TestFlattenFP16Op_4(TestFlattenOp_4): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op_4(TestFlattenOp_4): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -294,6 +309,9 @@ class TestFlattenFP16Op_5(TestFlattenOp_5): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16Op_5(TestFlattenOp_5): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" @@ -305,7 +323,7 @@ class TestFlattenOp_ZeroDim(TestFlattenOp): self.stop_axis = -1 self.new_shape = (1,) - def skip_cinn(self): + def if_enable_cinn(self): self.enable_cinn = False def init_attrs(self): @@ -363,6 +381,9 @@ class TestFlattenFP16OpSixDims(TestFlattenOpSixDims): "core is not complied with CUDA and not support the bfloat16", ) class TestFlattenBF16OpSixDims(TestFlattenOpSixDims): + def if_enable_cinn(self): + pass + def init_test_dtype(self): self.dtype = "uint16" diff --git a/test/legacy_test/test_index_select_op.py b/test/legacy_test/test_index_select_op.py index 40a01aef3f6..ceb152a465f 100644 --- a/test/legacy_test/test_index_select_op.py +++ b/test/legacy_test/test_index_select_op.py @@ -19,7 +19,7 @@ from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid -from paddle.fluid import Program, program_guard +from paddle.fluid import Program, core, program_guard np.random.seed(1024) @@ -102,8 +102,11 @@ class TestIndexSelectFP16OP(TestIndexSelectOp): class TestIndexSelectBF16Op(OpTest): def setUp(self): self.python_api = paddle.index_select + self.public_python_api = paddle.index_select + self.prim_op_type = "comp" self.op_type = "index_select" self.init_dtype_type() + self.if_skip_cinn() index_np = np.random.randint( low=0, high=self.x_shape[self.dim], size=self.index_size ) @@ -124,6 +127,9 @@ class TestIndexSelectBF16Op(OpTest): out = np.reshape(out_list, self.out_shape) self.outputs = {'Out': convert_float_to_uint16(out)} + def if_skip_cinn(self): + 
self.enable_cinn = False + def init_dtype_type(self): self.dim = 1 self.x_type = np.uint16 @@ -132,10 +138,12 @@ class TestIndexSelectBF16Op(OpTest): self.index_size = 100 def test_check_output(self): - self.check_output() + place = core.CUDAPlace(0) + self.check_output_with_place(place) def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) class TestIndexSelectAPI(unittest.TestCase): diff --git a/test/legacy_test/test_pad_op.py b/test/legacy_test/test_pad_op.py index a25956cfb06..3cec8719e13 100644 --- a/test/legacy_test/test_pad_op.py +++ b/test/legacy_test/test_pad_op.py @@ -100,7 +100,7 @@ def create_test_fp16(parent): return np.float16 def test_check_grad_normal(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_prim=True) cls_name = "{}_{}".format(parent.__name__, "Fp16") TestPadFp16.__name__ = cls_name @@ -238,9 +238,12 @@ class TestPadBP16Op(OpTest): ) self.inputs = {'X': convert_float_to_uint16(x)} self.outputs = {'Out': convert_float_to_uint16(out)} - self.enable_cinn = False self.prim_op_type = "prim" self.public_python_api = pad_wrapper + self.if_enable_cinn() + + def if_enable_cinn(self): + pass def initTestCase(self): self.shape = (16, 16) diff --git a/test/legacy_test/test_roll_op.py b/test/legacy_test/test_roll_op.py index 1dab474ac26..f491112b6a4 100644 --- a/test/legacy_test/test_roll_op.py +++ b/test/legacy_test/test_roll_op.py @@ -53,9 +53,6 @@ class TestRollOp(OpTest): def test_check_grad_normal(self): self.check_grad(['X'], 'Out', check_prim=True) - def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True) - class TestRollOpCase2(TestRollOp): def init_dtype_type(self): diff --git a/test/legacy_test/test_scatter_op.py b/test/legacy_test/test_scatter_op.py index 2a222c9d96a..df264887c62 100644 --- a/test/legacy_test/test_scatter_op.py +++ b/test/legacy_test/test_scatter_op.py @@ -31,6 +31,7 @@ class TestScatterOp(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 50)).astype(target_dtype) index_np = np.array([1, 2]).astype("int32") @@ -44,11 +45,14 @@ class TestScatterOp(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output() def test_check_grad(self): self.check_grad(["X", "Updates"], "Out", check_prim=True) @@ -67,12 +71,14 @@ class TestScatterFP16Op(TestScatterOp): class TestScatterBF16Op(TestScatterOp): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -91,6 +97,7 @@ class TestScatterOp0(OpTest): self.python_api = paddle.scatter self.public_python_api = paddle.scatter self.prim_op_type = "prim" + self.if_enable_cinn() self._set_dtype() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) @@ -106,11 +113,14 @@ class TestScatterOp0(OpTest): self.attrs = {'overwrite': True} self.outputs = {'Out': 
output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output() def test_check_grad(self): self.check_grad(["X", "Updates"], "Out", check_prim=True) @@ -129,12 +139,14 @@ class TestScatterFP16Op0(TestScatterOp0): class TestScatterBF16Op0(TestScatterOp0): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -154,6 +166,7 @@ class TestScatterOp1(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) zeros_np = np.zeros([2, 3]).astype(target_dtype) @@ -171,11 +184,14 @@ class TestScatterOp1(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output() def test_check_grad(self): self.check_grad(["X", "Updates"], "Out", check_prim=True) @@ -194,12 +210,14 @@ class TestScatterFP16Op1(TestScatterOp1): class TestScatterBF16Op1(TestScatterOp1): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -222,6 +240,7 @@ class TestScatterOp2(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) index_np = np.array([1, 2]).astype("int32") @@ -235,13 +254,16 @@ class TestScatterOp2(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3, check_prim=True) + self.check_output_with_place(place, atol=1e-3) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -270,6 +292,8 @@ class TestScatterFP16Op2(TestScatterOp2): class TestScatterBF16Op2(TestScatterOp2): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False @@ -283,6 +307,7 @@ class TestScatterOp3(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) zeros_np = np.zeros([2, 3]).astype(target_dtype) @@ -300,13 +325,16 @@ class TestScatterOp3(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): if core.is_compiled_with_cuda(): place = 
core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3, check_prim=True) + self.check_output_with_place(place, atol=1e-3) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -335,6 +363,8 @@ class TestScatterFP16Op3(TestScatterOp3): class TestScatterBF16Op3(TestScatterOp3): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False @@ -345,6 +375,7 @@ class TestScatterOp4(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) index_np = np.array([1, 2]).astype("int64") @@ -358,11 +389,14 @@ class TestScatterOp4(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output() def test_check_grad(self): self.check_grad(['X', 'Updates'], 'Out', check_prim=True) @@ -381,12 +415,14 @@ class TestScatterFP16Op4(TestScatterOp4): class TestScatterBF16Op4(TestScatterOp4): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -409,6 +445,7 @@ class TestScatterOp5(OpTest): self.public_python_api = paddle.scatter self.prim_op_type = "prim" self._set_dtype() + self.if_enable_cinn() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 3)).astype(target_dtype) index_np = np.array([1, 2]).astype("int64") @@ -422,13 +459,16 @@ class TestScatterOp5(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3, check_prim=True) + self.check_output_with_place(place, atol=1e-3) def test_check_grad(self): if core.is_compiled_with_cuda(): @@ -457,6 +497,8 @@ class TestScatterFP16Op5(TestScatterOp5): class TestScatterBF16Op5(TestScatterOp5): def _set_dtype(self): self.dtype = np.uint16 + + def if_enable_cinn(self): self.enable_cinn = False @@ -466,7 +508,7 @@ class TestScatterOp6(OpTest): self.python_api = paddle.scatter self.public_python_api = paddle.scatter self.prim_op_type = "prim" - self.enable_cinn = False + self.if_enable_cinn() self._set_dtype() target_dtype = "float16" if self.dtype == np.float16 else "float32" ref_np = np.ones((3, 50)).astype(target_dtype) @@ -481,11 +523,14 @@ class TestScatterOp6(OpTest): self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} self.outputs = {'Out': output_np} + def if_enable_cinn(self): + pass + def _set_dtype(self): self.dtype = np.float32 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output() def test_check_grad(self): self.check_grad(["X", "Updates"], "Out", check_prim=True) @@ -502,13 +547,16 @@ class TestScatterFP16Op6(TestScatterOp6): "core is not complied with CUDA and not support the bfloat16", ) class TestScatterBF16Op6(TestScatterOp6): + def if_enable_cinn(self): + self.enable_cinn 
= False + def _set_dtype(self): self.dtype = np.uint16 def test_check_output(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_prim=True) + self.check_output_with_place(place) def test_check_grad(self): if core.is_compiled_with_cuda(): diff --git a/test/legacy_test/test_split_op.py b/test/legacy_test/test_split_op.py index 29446bafbbf..87829f503cc 100644 --- a/test/legacy_test/test_split_op.py +++ b/test/legacy_test/test_split_op.py @@ -32,7 +32,6 @@ class TestSplitOp(OpTest): self.dtype = self.get_dtype() axis = 1 if self.dtype == np.uint16: - self.enable_cinn = False x = np.random.random((4, 5, 6)).astype(np.float32) out = np.split(x, [2, 3], axis) self.inputs = {'X': convert_float_to_uint16(x)} @@ -285,7 +284,7 @@ def create_test_bf16(parent): def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['X'], 'out2') + self.check_grad_with_place(place, ['X'], 'out2', check_prim=True) cls_name = "{}_{}".format(parent.__name__, "BF16Op") TestSplitBF16Op.__name__ = cls_name diff --git a/test/legacy_test/test_squeeze2_op.py b/test/legacy_test/test_squeeze2_op.py index 8a5c5e74efc..c2bef8aa822 100755 --- a/test/legacy_test/test_squeeze2_op.py +++ b/test/legacy_test/test_squeeze2_op.py @@ -16,10 +16,11 @@ import os import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from test_attribute_var import UnittestBase import paddle +from paddle.fluid import core from paddle.fluid.framework import Program, program_guard paddle.enable_static() @@ -36,19 +37,32 @@ class TestSqueezeOp(OpTest): "Out" ] # python out sig is customized output signature. self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} + self.init_dtype() + self.if_enable_cinn() + x = np.random.random(self.ori_shape).astype("float64") + xshape = np.random.random(self.ori_shape).astype("float64") + if hasattr(self, "dtype") and self.dtype == np.uint16: + x = convert_float_to_uint16(x.astype(np.float32)) + xshape = convert_float_to_uint16(xshape.astype(np.float32)) + self.inputs = {"X": x} self.init_attrs() self.outputs = { "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float64"), + "XShape": xshape, } + def if_enable_cinn(self): + pass + def test_check_output(self): self.check_output(no_check_set=['XShape'], check_prim=True) def test_check_grad(self): self.check_grad(["X"], "Out", check_prim=True) + def init_dtype(self): + self.dtype = np.float64 + def init_test_case(self): self.ori_shape = (1, 3, 1, 40) self.axes = (0, 2) @@ -58,6 +72,16 @@ class TestSqueezeOp(OpTest): self.attrs = {"axes": self.axes} +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestSqueezeOpBF16OP(TestSqueezeOp): + def init_dtype(self): + self.dtype = np.uint16 + + # Correct: There is mins axis. class TestSqueezeOp1(TestSqueezeOp): def init_test_case(self): @@ -66,6 +90,16 @@ class TestSqueezeOp1(TestSqueezeOp): self.new_shape = (20, 5) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestSqueezeOp1BF16Op(TestSqueezeOp): + def init_dtype(self): + self.dtype = np.uint16 + + # Correct: No axes input. 
class TestSqueezeOp2(TestSqueezeOp): def setUp(self): @@ -77,19 +111,42 @@ class TestSqueezeOp2(TestSqueezeOp): "Out" ] # python out sig is customized output signature. self.init_test_case() - self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} + self.init_dtype() + self.if_enable_cinn() + x = np.random.random(self.ori_shape).astype("float64") + xshape = np.random.random(self.ori_shape).astype("float64") + if hasattr(self, "dtype") and self.dtype == np.uint16: + x = convert_float_to_uint16(x.astype(np.float32)) + xshape = convert_float_to_uint16(xshape.astype(np.float32)) + self.inputs = {"X": x} self.init_attrs() self.outputs = { "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.ori_shape).astype("float64"), + "XShape": xshape, } + def if_enable_cinn(self): + pass + + def init_dtype(self): + self.dtype = np.float64 + def init_test_case(self): self.ori_shape = (1, 20, 1, 5) self.axes = () self.new_shape = (20, 5) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestSqueezeOp2BF16Op(TestSqueezeOp): + def init_dtype(self): + self.dtype = np.uint16 + + # Correct: Just part of axes be squeezed. class TestSqueezeOp3(TestSqueezeOp): def init_test_case(self): @@ -98,6 +155,16 @@ class TestSqueezeOp3(TestSqueezeOp): self.new_shape = (6, 5, 1, 4) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestSqueezeOp3BF16Op(TestSqueezeOp): + def init_dtype(self): + self.dtype = np.uint16 + + class TestSqueeze2AxesTensor(UnittestBase): def init_info(self): self.shapes = [[2, 3, 4]] diff --git a/test/legacy_test/test_stack_op.py b/test/legacy_test/test_stack_op.py index b6a19615a6e..fea31835120 100644 --- a/test/legacy_test/test_stack_op.py +++ b/test/legacy_test/test_stack_op.py @@ -167,7 +167,6 @@ class TestStackBF16Op(OpTest): self.initParameters() self.op_type = 'stack' self.prim_op_type = "comp" - self.enable_cinn = False self.python_api = paddle.stack self.public_python_api = paddle.stack self.x = [] @@ -191,8 +190,7 @@ class TestStackBF16Op(OpTest): self.check_output(check_prim=True) def test_check_grad(self): - # concat_grad unspport bfloat16 dtype, skip check_prim - self.check_grad(self.get_x_names(), 'Y') + self.check_grad(self.get_x_names(), 'Y', check_prim=True) class TestStackAPIWithLoDTensorArray(unittest.TestCase): diff --git a/test/legacy_test/test_tile_op.py b/test/legacy_test/test_tile_op.py index feca03c5a0c..5267bfa1c58 100644 --- a/test/legacy_test/test_tile_op.py +++ b/test/legacy_test/test_tile_op.py @@ -30,15 +30,18 @@ class TestTileOpRank1(OpTest): self.op_type = "tile" self.python_api = paddle.tile self.prim_op_type = "prim" - self.enable_cinn = True self.public_python_api = paddle.tile self.init_data() + self.if_enable_cinn() self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")} self.attrs = {'repeat_times': self.repeat_times} output = np.tile(self.inputs['X'], self.repeat_times) self.outputs = {'Out': output} + def if_enable_cinn(self): + pass + def init_data(self): self.ori_shape = [100] self.repeat_times = [2] @@ -52,24 +55,30 @@ class TestTileOpRank1(OpTest): class TestTileOpRank_ZeroDim1(TestTileOpRank1): def init_data(self): - self.enable_cinn = False self.ori_shape = [] self.repeat_times = [] + def if_enable_cinn(self): + self.enable_cinn = 
False + class TestTileOpRank_ZeroDim2(TestTileOpRank1): def init_data(self): - self.enable_cinn = False self.ori_shape = [] self.repeat_times = [2] + def if_enable_cinn(self): + self.enable_cinn = False + class TestTileOpRank_ZeroDim3(TestTileOpRank1): def init_data(self): - self.enable_cinn = False self.ori_shape = [] self.repeat_times = [2, 3] + def if_enable_cinn(self): + self.enable_cinn = False + # with dimension expanding class TestTileOpRank2Expanding(TestTileOpRank1): @@ -240,7 +249,6 @@ class TestTileBF16OP(OpTest): self.__class__.op_type = self.op_type self.python_api = paddle.tile self.prim_op_type = "prim" - self.enable_cinn = False self.public_python_api = paddle.tile self.init_data() x = np.random.uniform(10, size=self.ori_shape).astype(np.float32) diff --git a/test/legacy_test/test_top_k_v2_op.py b/test/legacy_test/test_top_k_v2_op.py index 5612703968d..872a52e7ccc 100644 --- a/test/legacy_test/test_top_k_v2_op.py +++ b/test/legacy_test/test_top_k_v2_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest, convert_float_to_uint16 +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle from paddle.fluid import core @@ -51,6 +55,7 @@ class TestTopkOp(OpTest): self.dtype = np.float64 self.input_data = np.random.rand(10, 20) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -58,6 +63,9 @@ class TestTopkOp(OpTest): ) self.outputs = {'Out': output, 'Indices': indices} + def if_enable_cinn(self): + pass + def test_check_output(self): self.check_output() @@ -115,6 +123,7 @@ class TestTopkOp4(TestTopkOp): self.dtype = np.float64 self.input_data = np.random.rand(10, 10, 5) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -137,6 +146,7 @@ class TestTopkOp5(TestTopkOp): self.dtype = np.float64 self.input_data = np.random.rand(10, 10, 5) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -159,6 +169,7 @@ class TestTopkOp6(TestTopkOp): self.dtype = np.float32 self.input_data = np.random.rand(10, 10, 5) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -181,6 +192,7 @@ class TestTopkOp7(TestTopkOp): self.dtype = np.float16 self.input_data = np.random.rand(10, 20, 10) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -198,6 +210,7 @@ class TestTopkFP16Op(TestTopkOp): self.prim_op_type = "prim" self.input_data = np.random.rand(10, 20).astype(self.dtype) self.init_args() + self.if_enable_cinn() self.inputs = {'X': self.input_data} self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( @@ -218,9 +231,11 @@ class TestTopkBF16Op(TestTopkOp): self.public_python_api = paddle.topk self.dtype = np.uint16 self.prim_op_type = "prim" - self.input_data = np.random.rand(10, 20).astype(np.float32) + self.input_data = np.random.random([10, 20]).astype(np.float32) self.init_args() + self.if_enable_cinn() self.inputs = {'X': 
convert_float_to_uint16(self.input_data)} + self.input_data = convert_uint16_to_float(self.inputs['X']) self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} output, indices = numpy_topk( self.input_data, axis=self.axis, k=self.k, largest=self.largest @@ -230,13 +245,16 @@ class TestTopkBF16Op(TestTopkOp): 'Indices': indices, } + def if_enable_cinn(self): + self.enable_cinn = False + def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=True) + self.check_output_with_place(place) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place(place, {'X'}, 'Out', check_eager=True) + self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) class TestTopKAPI(unittest.TestCase): diff --git a/test/legacy_test/test_transpose_op.py b/test/legacy_test/test_transpose_op.py index 3865476f529..5bbc458799f 100644 --- a/test/legacy_test/test_transpose_op.py +++ b/test/legacy_test/test_transpose_op.py @@ -244,7 +244,7 @@ class TestAutoTuneTransposeBF16Op(OpTest): self.python_api = paddle.transpose self.public_python_api = paddle.transpose self.prim_op_type = "prim" - self.enable_cinn = False + self.if_enable_cinn() x = np.random.random(self.shape).astype("float32") self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { @@ -258,6 +258,9 @@ class TestAutoTuneTransposeBF16Op(OpTest): 'Out': self.inputs['X'].transpose(self.axis), } + def if_enable_cinn(self): + self.enable_cinn = False + def initTestCase(self): fluid.core.set_autotune_range(0, 3) fluid.core.update_autotune_status() @@ -283,7 +286,7 @@ class TestTransposeFP16Op(OpTest): self.initTestCase() self.dtype = np.float16 self.prim_op_type = "prim" - self.enable_cinn = False + self.if_enable_cinn() self.python_api = paddle.transpose self.public_python_api = paddle.transpose x = np.random.random(self.shape).astype(self.dtype) @@ -298,6 +301,9 @@ class TestTransposeFP16Op(OpTest): 'Out': self.inputs['X'].transpose(self.axis), } + def if_enable_cinn(self): + pass + def init_op_type(self): self.op_type = "transpose2" self.use_mkldnn = False @@ -323,6 +329,7 @@ class TestTransposeBF16Op(OpTest): self.python_api = paddle.transpose self.public_python_api = paddle.transpose x = np.random.random(self.shape).astype("float32") + self.if_enable_cinn() self.inputs = {'X': convert_float_to_uint16(x)} self.attrs = { @@ -336,6 +343,9 @@ class TestTransposeBF16Op(OpTest): 'Out': self.inputs['X'].transpose(self.axis), } + def if_enable_cinn(self): + self.enable_cinn = False + def init_op_type(self): self.op_type = "transpose2" self.use_mkldnn = False diff --git a/test/legacy_test/test_unsqueeze2_op.py b/test/legacy_test/test_unsqueeze2_op.py index b7b4c185e97..2ba8d1204b9 100755 --- a/test/legacy_test/test_unsqueeze2_op.py +++ b/test/legacy_test/test_unsqueeze2_op.py @@ -37,12 +37,16 @@ class TestUnsqueezeOp(OpTest): "XShape": np.random.random(self.ori_shape).astype("float64"), } self.prim_op_type = "comp" + self.if_enable_cinn() + + def if_enable_cinn(self): + pass def test_check_output(self): self.check_output(no_check_set=["XShape"], check_prim=True) def test_check_grad(self): - self.check_grad(["X"], "Out") + self.check_grad(["X"], "Out", check_prim=True) def init_test_case(self): self.ori_shape = (3, 40) @@ -90,7 +94,6 @@ class TestUnsqueezeOp_ZeroDim1(TestUnsqueezeOp): self.ori_shape = () self.axes = (-1,) self.new_shape = 1 - self.enable_cinn = False class TestUnsqueezeOp_ZeroDim2(TestUnsqueezeOp): @@ -98,7 +101,6 @@ class 
TestUnsqueezeOp_ZeroDim2(TestUnsqueezeOp): self.ori_shape = () self.axes = (-1, 1) self.new_shape = (1, 1) - self.enable_cinn = False class TestUnsqueezeOp_ZeroDim3(TestUnsqueezeOp): @@ -106,7 +108,6 @@ class TestUnsqueezeOp_ZeroDim3(TestUnsqueezeOp): self.ori_shape = () self.axes = (0, 1, 2) self.new_shape = (1, 1, 1) - self.enable_cinn = False # axes is a list(with tensor) -- GitLab
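A minimal sketch of the bf16 test pattern this patch rolls out across the legacy tests: bf16 data is carried as uint16 via convert_float_to_uint16, CINN is toggled through an if_enable_cinn() hook instead of hard-coding self.enable_cinn in setUp, and output/gradient checks run on a CUDA place with check_prim=True. It assumes the Paddle legacy-test harness (eager_op_test) used throughout the diff; the op (assign) and the shapes are illustrative, and the place-specific checks mirror what the patch does for the bf16 scatter and index_select cases rather than any one test verbatim.

# Illustrative sketch only -- distilled from the pattern in this patch,
# not an exact copy of any single test it touches.
import unittest

import numpy as np
from eager_op_test import OpTest, convert_float_to_uint16

import paddle
from paddle.fluid import core


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
    "core is not compiled with CUDA or does not support bfloat16",
)
class TestExampleBF16Op(OpTest):
    def setUp(self):
        self.op_type = "assign"
        self.python_api = paddle.assign
        self.public_python_api = paddle.assign
        self.prim_op_type = "prim"
        self.if_enable_cinn()
        # bf16 tensors are represented as uint16 bit patterns in these tests.
        x = np.random.uniform(0, 1, [100, 10]).astype(np.float32)
        x = convert_float_to_uint16(x)
        self.inputs = {'X': x}
        self.outputs = {'Out': x}

    def if_enable_cinn(self):
        # Hook point introduced by this patch: bf16 cases that CINN cannot
        # handle override this with `self.enable_cinn = False`; others pass.
        pass

    def test_check_output(self):
        place = core.CUDAPlace(0)
        self.check_output_with_place(place)

    def test_check_grad(self):
        place = core.CUDAPlace(0)
        self.check_grad_with_place(place, ['X'], 'Out', check_prim=True)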