From 4c5cfdea1b4fee23c0d3dfdcd93d2e0731790727 Mon Sep 17 00:00:00 2001 From: liu zhengxi <380185688@qq.com> Date: Fri, 18 Sep 2020 16:18:49 +0800 Subject: [PATCH] fix paddle.nn.Transformer api (#27391) --- .../tests/unittests/test_transformer_api.py | 135 ++++++++++++++++++ python/paddle/nn/layer/transformer.py | 102 ++++++++++--- 2 files changed, 217 insertions(+), 20 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 5fea9f69a18..bd76edc9d8c 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase): trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) + def test_transformer_attr_1(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None], + bias_attr=[False]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_2(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None, None], + bias_attr=[False, False]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_3(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None, None, None], + bias_attr=[False, False, True]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_boolean(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + bias_attr=False) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 63069e83952..4b199d5816c 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n): if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + param_attrs = [] + for attr in param_attr: + if isinstance(attr, bool): + if attr: + param_attrs.append(ParamAttr._to_attr(None)) + else: + param_attrs.append(False) + else: + param_attrs.append(ParamAttr._to_attr(attr)) + # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + elif isinstance(param_attr, bool): + param_attrs = [] + if param_attr: + param_attrs = [ParamAttr._to_attr(None) for i in range(n)] + else: + param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) @@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer): Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . - bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. @@ -986,22 +1001,31 @@ class Transformer(Layer): Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for - self attention, `weight_attr[1]` would be used as `weight_attr` for - cross attention, and `weight_attr[2]` would be used as `weight_attr` - for linear in FFN. Otherwise, the three sub-layers all uses it as - `weight_attr` to create parameters. Default: None, which means the - default weight parameter property is used. See usage for details + If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, + `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` + would be used as `weight_attr` for cross attention of `TransformerDecoder`, + and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. + If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention + and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. - If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for - self attention, `bias_attr[1]` would be used as `bias_attr` for - cross attention, and `bias_attr[2]` would be used as `bias_attr` - for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. The `False` value means the - corresponding layer would not have trainable bias parameter. See - usage for details in :code:`ParamAttr` . Default: None,which means - the default bias parameter property is used. + If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, + `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` + would be used as `bias_attr` for cross attention of `TransformerDecoder`, + and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. + If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention + and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . + Default: None,which means the default bias parameter property is used. custom_encoder (Layer): If custom encoder is provided, use it as the encoder. Default None custom_decoder (Layer): If custom decoder is provided, use it as the decoder. @@ -1049,13 +1073,51 @@ class Transformer(Layer): custom_decoder=None): super(Transformer, self).__init__() + if isinstance(bias_attr, (list, tuple)): + if len(bias_attr) == 1: + encoder_bias_attr = [bias_attr[0]] * 2 + decoder_bias_attr = [bias_attr[0]] * 3 + elif len(bias_attr) == 2: + encoder_bias_attr = bias_attr + decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]] + elif len(bias_attr) == 3: + encoder_bias_attr = [bias_attr[0], bias_attr[-1]] + decoder_bias_attr = bias_attr + else: + assert False, ( + "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple" + ) + else: + encoder_bias_attr = bias_attr + decoder_bias_attr = bias_attr + + if isinstance(weight_attr, (list, tuple)): + if len(weight_attr) == 1: + encoder_weight_attr = [weight_attr[0]] * 2 + decoder_weight_attr = [weight_attr[0]] * 3 + elif len(weight_attr) == 2: + encoder_weight_attr = weight_attr + decoder_weight_attr = [ + weight_attr[0], weight_attr[0], weight_attr[-1] + ] + elif len(weight_attr) == 3: + encoder_weight_attr = [weight_attr[0], weight_attr[-1]] + decoder_weight_attr = weight_attr + else: + assert False, ( + "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple" + ) + else: + encoder_weight_attr = weight_attr + decoder_weight_attr = weight_attr + if custom_encoder is not None: self.encoder = custom_encoder else: encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, weight_attr, - bias_attr) + attn_dropout, act_dropout, normalize_before, + encoder_weight_attr, encoder_bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) @@ -1065,8 +1127,8 @@ class Transformer(Layer): else: decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, weight_attr, - bias_attr) + attn_dropout, act_dropout, normalize_before, + decoder_weight_attr, decoder_bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) -- GitLab