From 7696ae02de215a9dd1d8ec17778485e832e53132 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 4 Jan 2023 10:21:09 +0800 Subject: [PATCH] [Cherry-pick] add condition of skipif (#49407) * resolve conflict * fix format error --- .../unittests/test_fused_gate_attention_op.py | 188 ++++++++++++------ 1 file changed, 126 insertions(+), 62 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index f911d614ee4..18c28144d6e 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program from paddle.fluid import core -@unittest.skipIf(not core.is_compiled_with_cuda(), - "Paddle is not compiled with CUDA") +@unittest.skipIf( + not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA" +) class TestFusedGateAttentionOp(OpTest): - def setUp(self): self.__class__.op_type = "fused_gate_attention" # use autograd to check grad in this unittest. @@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest): self.bias_attr = True def generate_input_data(self): - def _random(shape): if self.dtype == "bfloat16": data = np.random.random(shape).astype("float32") @@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest): np.random.seed(123) self.query = _random( - (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + (self.batch_size, self.msa_len, self.res_len, self.q_dim) + ) self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim)) self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) @@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest): self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) else: self.key = _random( - (self.batch_size, self.msa_len, self.m_size, self.kv_dim)) + (self.batch_size, self.msa_len, self.m_size, self.kv_dim) + ) self.qkv_weight = None self.attn_mask = _random( - (self.batch_size, self.msa_len, 1, 1, self.m_size)) + (self.batch_size, self.msa_len, 1, 1, self.m_size) + ) if self.bias_attr: self.nonbatched_bias = _random( - (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) + (self.batch_size, 1, self.num_heads, self.res_len, self.m_size) + ) if self.has_gating: self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim)) @@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest): self.output_b = _random((self.out_dim)) self.dout = _random( - (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + (self.batch_size, self.msa_len, self.res_len, self.q_dim) + ) def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out): outputs = [ - softmax_out, fmha_out, gate_out if self.has_gating else None, out, - query.grad, None if self.merge_qkv else key.grad + softmax_out, + fmha_out, + gate_out if self.has_gating else None, + out, + query.grad, + None if self.merge_qkv else key.grad, ] return outputs @@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest): paddle.disable_static(place=paddle.CUDAPlace(0)) query = paddle.to_tensor(self.query, stop_gradient=False) - key = query if self.merge_qkv else paddle.to_tensor(self.key, - stop_gradient=False) + key = ( + query + if self.merge_qkv + else paddle.to_tensor(self.key, stop_gradient=False) + ) q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) k_weight = 
paddle.to_tensor(self.k_weight, stop_gradient=False) v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) - c = self.head_dim**(-0.5) + c = self.head_dim ** (-0.5) # [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim] # -> [batch_size, msa_len, res_len, num_heads, head_dim] q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c @@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest): # -> [batch_size, msa_len, num_heads, res_len, m_size] logits = logits + src_mask if self.bias_attr: - nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, - stop_gradient=False) + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False + ) # [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size] logits = logits + nonbatched_bias @@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest): # gate_values = paddle.einsum('nbqc,chv->nbqhv', query, # gating_w) + gating_b gating_w_2d = paddle.reshape( - gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]) + gating_w, shape=[self.q_dim, self.num_heads * self.head_dim] + ) gate_values_4d = paddle.matmul(query, gating_w_2d) - gate_values = paddle.reshape( - gate_values_4d, - shape=[ - self.batch_size, self.msa_len, self.res_len, self.num_heads, - self.head_dim - ]) + gating_b + gate_values = ( + paddle.reshape( + gate_values_4d, + shape=[ + self.batch_size, + self.msa_len, + self.res_len, + self.num_heads, + self.head_dim, + ], + ) + + gating_b + ) gate_values = nn.functional.sigmoid(gate_values) gate_out = fmha_out * gate_values else: @@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest): gate_out, shape=[ self.batch_size * self.msa_len * self.res_len, - self.num_heads * self.head_dim - ]) + self.num_heads * self.head_dim, + ], + ) output_w_2d = paddle.reshape( - output_w, shape=[self.num_heads * self.head_dim, self.out_dim]) + output_w, shape=[self.num_heads * self.head_dim, self.out_dim] + ) out_2d = paddle.matmul(gate_out_2d, output_w_2d) - out = paddle.reshape( - out_2d, - shape=[self.batch_size, self.msa_len, self.res_len, self.out_dim - ]) + output_b - - paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], - retain_graph=True) - return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, - out) + out = ( + paddle.reshape( + out_2d, + shape=[ + self.batch_size, + self.msa_len, + self.res_len, + self.out_dim, + ], + ) + + output_b + ) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return self.collect_outputs( + query, key, softmax_out, fmha_out, gate_out, out + ) def get_fused_gate_attention_out(self): paddle.disable_static(place=paddle.CUDAPlace(0)) @@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest): src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) if self.bias_attr: - nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, - stop_gradient=False) + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False + ) else: nonbatched_bias = None if self.has_gating: @@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest): output_w = paddle.to_tensor(self.output_w, stop_gradient=False) output_b = paddle.to_tensor(self.output_b, stop_gradient=False) - _, _, _, _, softmax_out, fmha_out, gate_out, out = _legacy_C_ops.fused_gate_attention( - query, key, q_weight, k_weight, v_weight, qkv_weight, - nonbatched_bias, src_mask, 
gating_w, gating_b, output_w, output_b, - 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) - - paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], - retain_graph=True) - return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, - out) + ( + _, + _, + _, + _, + softmax_out, + fmha_out, + gate_out, + out, + ) = _legacy_C_ops.fused_gate_attention( + query, + key, + q_weight, + k_weight, + v_weight, + qkv_weight, + nonbatched_bias, + src_mask, + gating_w, + gating_b, + output_w, + output_b, + 'has_gating', + self.has_gating, + 'merge_qkv', + self.merge_qkv, + ) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return self.collect_outputs( + query, key, softmax_out, fmha_out, gate_out, out + ) def check(self, ref, out, atol, rtol, check_equal, name): - def _convert(value): if self.dtype == "bfloat16": return convert_uint16_to_float(value) @@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest): if check_equal: self.assertTrue( np.equal(_convert(ref), _convert(out)).all(), - "Checking < {} > failed!".format(name)) + "Checking < {} > failed!".format(name), + ) else: np.testing.assert_allclose( _convert(ref), _convert(out), atol=atol, rtol=rtol, - err_msg="Checking < {} > failed!".format(name)) + err_msg="Checking < {} > failed!".format(name), + ) def check_output_and_grad(self, atol, rtol): output_names = [ - "softmax_out", "fmha_out", "gate_out", "out", "query_grad", - "key_grad" + "softmax_out", + "fmha_out", + "gate_out", + "out", + "query_grad", + "key_grad", ] outputs_ref = self.get_reference_out() outputs_fused = self.get_fused_gate_attention_out() @@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest): # that in fused ops, check_equal is set to False and we use allclose # to check the correctness. 
check_equal = False - self.check(ref_res.numpy(), fused_res.numpy(), atol, rtol, - check_equal, output_names[i]) + self.check( + ref_res.numpy(), + fused_res.numpy(), + atol, + rtol, + check_equal, + output_names[i], + ) def test_output_and_grad(self): self.check_output_and_grad(atol=1e-5, rtol=1e-6) class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp): - def config(self): super().config() self.batch_size = 2 class TestSeparatedQKVCase(TestFusedGateAttentionOp): - def config(self): self.dtype = "float32" self.has_gating = False @@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp): class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): - def config(self): super().config() self.has_gating = False @@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): class TestMergeQKVFp16Case(TestFusedGateAttentionOp): - def config(self): super().config() self.dtype = "float16" @@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp): class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case): - def config(self): super().config() self.batch_size = 2 @unittest.skipIf( - not core.is_compiled_with_cuda() or get_cuda_version() < 11000, - "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" + not core.is_compiled_with_cuda() + or get_cuda_version() < 11000 + or paddle.device.cuda.get_device_capability()[0] < 8, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3", ) class TestMergeQKVBF16Case(TestFusedGateAttentionOp): - def config(self): super().config() self.dtype = "bfloat16" @@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp): class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case): - def config(self): super().config() self.batch_size = 2 -- GitLab
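
Note on the guard this cherry-pick adds: the bfloat16 test classes are skipped unless three conditions hold at once: a CUDA build of Paddle, a CUDA toolkit of at least 11.0 (get_cuda_version() < 11000), and an Ampere-or-newer GPU (compute capability >= 8). The sketch below reproduces that guard pattern in a self-contained file. The get_cuda_version() helper here is an assumption rebuilt from paddle.version.cuda(); the real test module ships its own helper. Like the original condition, the decorator assumes a GPU is visible once the build check passes, because it is evaluated at class-definition time.

import unittest

import paddle
from paddle.fluid import core


def get_cuda_version():
    # Hypothetical helper: encode "11.7" as 11070 so it can be compared against
    # integer thresholds such as 11000 (CUDA 11.0). paddle.version.cuda() is
    # assumed to return the toolkit version string, or "False" for CPU builds.
    version_str = paddle.version.cuda()
    if not version_str or version_str == 'False':
        return -1
    parts = version_str.split('.')
    major = int(parts[0])
    minor = int(parts[1]) if len(parts) > 1 else 0
    return major * 1000 + minor * 10


@unittest.skipIf(
    not core.is_compiled_with_cuda()
    or get_cuda_version() < 11000
    or paddle.device.cuda.get_device_capability()[0] < 8,
    "bfloat16 requires a CUDA build, CUDA >= 11.0 and compute capability >= 8.0",
)
class TestBF16GuardSketch(unittest.TestCase):
    def test_guard(self):
        # Only reached on machines that satisfy all three conditions above.
        self.assertTrue(core.is_compiled_with_cuda())


if __name__ == '__main__':
    unittest.main()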
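
For readers following the shape comments in get_reference_out(), the sketch below restates the non-fused gate-attention math in plain NumPy. The diff only shows changed lines, so the k/v projections, the logits einsum and the final projection are reconstructed here from the surrounding shape comments and the commented-out einsum equations; read the exact equations as assumptions rather than a copy of the test.

import numpy as np


def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)


def gate_attention_reference(query, key, attn_mask, nonbatched_bias,
                             q_w, k_w, v_w, gating_w, gating_b,
                             output_w, output_b):
    c = q_w.shape[-1] ** (-0.5)  # 1 / sqrt(head_dim)
    # [batch, msa, res, q_dim] x [q_dim, heads, head_dim]
    # -> [batch, msa, res, heads, head_dim]
    q = np.einsum('nbqa,ahc->nbqhc', query, q_w) * c
    k = np.einsum('nbka,ahc->nbkhc', key, k_w)
    v = np.einsum('nbka,ahc->nbkhc', key, v_w)
    # [batch, msa, heads, res, m_size] logits, plus mask and per-head bias
    logits = np.einsum('nbqhc,nbkhc->nbhqk', q, k) + attn_mask + nonbatched_bias
    weights = softmax(logits, axis=-1)
    # back to [batch, msa, res, heads, head_dim]
    fmha_out = np.einsum('nbhqk,nbkhc->nbqhc', weights, v)
    # gate computed from the raw query, applied elementwise to the attention output
    gate_logits = np.einsum('nbqc,chv->nbqhv', query, gating_w) + gating_b
    gate_out = fmha_out * (1.0 / (1.0 + np.exp(-gate_logits)))
    # final projection to [batch, msa, res, out_dim]
    return np.einsum('nbqhc,hco->nbqo', gate_out, output_w) + output_b


if __name__ == '__main__':
    batch, msa, res, heads, head_dim, q_dim, out_dim = 1, 3, 5, 4, 8, 6, 6
    rng = np.random.default_rng(123)
    query = rng.standard_normal((batch, msa, res, q_dim))
    out = gate_attention_reference(
        query,
        query,  # merge_qkv case: the key tensor is the query itself
        attn_mask=np.zeros((batch, msa, 1, 1, res)),
        nonbatched_bias=np.zeros((batch, 1, heads, res, res)),
        q_w=rng.standard_normal((q_dim, heads, head_dim)),
        k_w=rng.standard_normal((q_dim, heads, head_dim)),
        v_w=rng.standard_normal((q_dim, heads, head_dim)),
        gating_w=rng.standard_normal((q_dim, heads, head_dim)),
        gating_b=rng.standard_normal((heads, head_dim)),
        output_w=rng.standard_normal((heads, head_dim, out_dim)),
        output_b=rng.standard_normal((out_dim,)),
    )
    print(out.shape)  # (1, 3, 5, 6)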