From 4c5cfdea1b4fee23c0d3dfdcd93d2e0731790727 Mon Sep 17 00:00:00 2001
From: liu zhengxi <380185688@qq.com>
Date: Fri, 18 Sep 2020 16:18:49 +0800
Subject: [PATCH] fix paddle.nn.Transformer api (#27391)

---
 .../tests/unittests/test_transformer_api.py   | 135 ++++++++++++++++++
 python/paddle/nn/layer/transformer.py         | 102 ++++++++++---
 2 files changed, 217 insertions(+), 20 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py
index 5fea9f69a18..bd76edc9d8c 100644
--- a/python/paddle/fluid/tests/unittests/test_transformer_api.py
+++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py
@@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase):
             trans_output = transformer(src, tgt, src_mask, tgt_mask,
                                        memory_mask)
 
+    def test_transformer_attr_1(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # A one-element weight_attr/bias_attr list is reused for every sub-layer.
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None],
+                bias_attr=[False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_2(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # Two entries: [0] goes to the attention sub-layers, [1] to the linear layers in FFN.
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None],
+                bias_attr=[False, False])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_3(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # Three entries: self attention, decoder cross attention, FFN; the encoder gets [0] and [2].
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                weight_attr=[None, None, None],
+                bias_attr=[False, False, True])
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
+    def test_transformer_attr_boolean(self):
+        batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params(
+            mode="decoder_layer")
+
+        # A plain bool bias_attr is passed unchanged to both the encoder and decoder layers.
+        with fluid.dygraph.guard(fluid.CPUPlace()):
+            transformer = Transformer(
+                d_model,
+                n_head,
+                dim_feedforward=dim_feedforward,
+                dropout=dropout,
+                bias_attr=False)
+            src = paddle.to_variable(
+                np.random.rand(batch_size, source_length, d_model).astype(
+                    "float32"))
+            tgt = paddle.to_variable(
+                np.random.rand(batch_size, target_length, d_model).astype(
+                    "float32"))
+            src_mask = np.zeros((batch_size, n_head, source_length,
+                                 source_length)).astype("float32")
+            src_mask[0][0][0][0] = -np.inf
+            src_mask = paddle.to_variable(src_mask)
+            tgt_mask = np.zeros((batch_size, n_head, target_length,
+                                 target_length)).astype("float32")
+            tgt_mask[0][0][0][0] = -1e9
+            memory_mask = np.zeros((batch_size, n_head, target_length,
+                                    source_length)).astype("float32")
+            memory_mask[0][0][0][0] = -1e9
+            tgt_mask, memory_mask = paddle.to_variable(
+                tgt_mask), paddle.to_variable(memory_mask)
+            trans_output = transformer(src, tgt, src_mask, tgt_mask,
+                                       memory_mask)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py
index 63069e83952..4b199d5816c 100644
--- a/python/paddle/nn/layer/transformer.py
+++ b/python/paddle/nn/layer/transformer.py
@@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n):
     if isinstance(param_attr, (list, tuple)):
         assert len(param_attr) == n, (
             "length of param_attr should be %d when it is a list/tuple" % n)
-        param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr]
+        param_attrs = []
+        for attr in param_attr:
+            if isinstance(attr, bool):
+                if attr:
+                    param_attrs.append(ParamAttr._to_attr(None))
+                else:
+                    param_attrs.append(False)
+            else:
+                param_attrs.append(ParamAttr._to_attr(attr))
+        # bool entries: True -> ParamAttr._to_attr(None), False -> kept as False.
+    elif isinstance(param_attr, bool):
+        param_attrs = []
+        if param_attr:
+            param_attrs = [ParamAttr._to_attr(None) for i in range(n)]
+        else:
+            param_attrs = [False] * n
     else:
         param_attrs = []
         attr = ParamAttr._to_attr(param_attr)
@@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer):
             Otherwise, MHA and FFN both use it as `weight_attr` to create parameters.
             Default: None, which means the default weight parameter property is used.
             See usage for details in :code:`ParamAttr` . 
-        bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
+        bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property.
             If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
             MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN.
             Otherwise, MHA and FFN both use it as `bias_attr` to create parameters.
@@ -986,22 +1001,31 @@ class Transformer(Layer):
             Otherwise, no pre-process and post-precess includes dropout, residual
             connection, layer normalization. Default False
         weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property.
-            If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for
-            self attention, `weight_attr[1]` would be used as `weight_attr` for
-            cross attention, and `weight_attr[2]` would be used as `weight_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `weight_attr` to create parameters. Default: None, which means the
-            default weight parameter property is used. See usage for details
+            If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, 
+            `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` 
+            would be used as `weight_attr` for cross attention of `TransformerDecoder`, 
+            and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. 
+            If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention 
+            and cross attention, and `weight_attr[1]` would be used as `weight_attr` for 
+            linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all use it as `weight_attr` to create parameters. 
+            Default: None, which means the default weight parameter property is used. 
+            See usage for details
             in :code:`ParamAttr` . 
         bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property.
-            If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for
-            self attention, `bias_attr[1]` would be used as `bias_attr` for
-            cross attention, and `bias_attr[2]` would be used as `bias_attr`
-            for linear in FFN. Otherwise, the three sub-layers all uses it as
-            `bias_attr` to create parameters. The `False` value means the
-            corresponding layer would not have trainable bias parameter. See
-            usage for details in :code:`ParamAttr` . Default: None,which means
-            the default bias parameter property is used.
+            If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, 
+            `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` 
+            would be used as `bias_attr` for cross attention of `TransformerDecoder`, 
+            and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. 
+            If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention 
+            and cross attention, and `bias_attr[1]` would be used as `bias_attr` for 
+            linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` 
+            for self attention, cross attention and linear in FFN. Otherwise, 
+            the three sub-layers all use it as `bias_attr` to create parameters. 
+            The `False` value means the corresponding layer would not have trainable 
+            bias parameter. See usage for details in :code:`ParamAttr` . 
+            Default: None, which means the default bias parameter property is used.
         custom_encoder (Layer): If custom encoder is provided, use it as the encoder.
             Default None
         custom_decoder (Layer): If custom decoder is provided, use it as the decoder.
@@ -1049,13 +1073,51 @@ class Transformer(Layer):
                  custom_decoder=None):
         super(Transformer, self).__init__()
 
+        if isinstance(bias_attr, (list, tuple)):
+            if len(bias_attr) == 1:
+                encoder_bias_attr = [bias_attr[0]] * 2
+                decoder_bias_attr = [bias_attr[0]] * 3
+            elif len(bias_attr) == 2:
+                encoder_bias_attr = bias_attr
+                decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]]
+            elif len(bias_attr) == 3:
+                encoder_bias_attr = [bias_attr[0], bias_attr[-1]]
+                decoder_bias_attr = bias_attr
+            else:
+                assert False, (
+                    "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_bias_attr = bias_attr
+            decoder_bias_attr = bias_attr
+
+        if isinstance(weight_attr, (list, tuple)):
+            if len(weight_attr) == 1:
+                encoder_weight_attr = [weight_attr[0]] * 2
+                decoder_weight_attr = [weight_attr[0]] * 3
+            elif len(weight_attr) == 2:
+                encoder_weight_attr = weight_attr
+                decoder_weight_attr = [
+                    weight_attr[0], weight_attr[0], weight_attr[-1]
+                ]
+            elif len(weight_attr) == 3:
+                encoder_weight_attr = [weight_attr[0], weight_attr[-1]]
+                decoder_weight_attr = weight_attr
+            else:
+                assert False, (
+                    "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple"
+                )
+        else:
+            encoder_weight_attr = weight_attr
+            decoder_weight_attr = weight_attr
+
         if custom_encoder is not None:
             self.encoder = custom_encoder
         else:
             encoder_layer = TransformerEncoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                encoder_weight_attr, encoder_bias_attr)
             encoder_norm = LayerNorm(d_model)
             self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                               encoder_norm)
@@ -1065,8 +1127,8 @@ class Transformer(Layer):
         else:
             decoder_layer = TransformerDecoderLayer(
                 d_model, nhead, dim_feedforward, dropout, activation,
-                attn_dropout, act_dropout, normalize_before, weight_attr,
-                bias_attr)
+                attn_dropout, act_dropout, normalize_before,
+                decoder_weight_attr, decoder_bias_attr)
             decoder_norm = LayerNorm(d_model)
             self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers,
                                               decoder_norm)
-- 
GitLab