From edacb6293ce539c5e7cc297114e4e40d0478b28f Mon Sep 17 00:00:00 2001 From: xiemoyuan <71377852+xiemoyuan@users.noreply.github.com> Date: Tue, 23 Feb 2021 14:47:17 +0800 Subject: [PATCH] Optimization of Transformer API (#30957) * Support 'bool' and 'int' for attention mask. * Update docs. * Add unittest for Transformer. * fix bugs. --- .../tests/unittests/test_transformer_api.py | 113 +++++++----- python/paddle/nn/layer/transformer.py | 168 +++++++++++++----- 2 files changed, 186 insertions(+), 95 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 194503b8ad2..587cedc6aad 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -51,6 +51,7 @@ def generate_query_key_value_cache(self_attention, num_heads, query_length, embed_dim, + attn_mask_type, key_length=None, value_length=None, kdim=None, @@ -58,8 +59,14 @@ def generate_query_key_value_cache(self_attention, cache=None): query = np.random.rand(batch_size, query_length, embed_dim).astype("float32") - attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) - attn_mask[0][0][0][0] = -1e9 + attn_mask = np.ones( + (batch_size, num_heads, query_length, key_length), dtype=attn_mask_type) + if attn_mask_type == 'int64': + attn_mask = np.tril(attn_mask) + elif attn_mask_type == 'float64': + attn_mask = (np.tril(attn_mask) - 1.0) * 1e9 + else: + raise ValueError("'attn_mask_type' should be 'int64' or 'float64'.") head_dim = embed_dim // num_heads if self_attention: @@ -115,6 +122,10 @@ def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): k = k.transpose([0, 1, 3, 2]) qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) if attn_mask is not None: + if attn_mask.dtype.name == 'int64': + attn_mask = (attn_mask.astype(qkt.dtype) - 1.0) * 1e9 + else: + attn_mask = attn_mask.astype(qkt.dtype) qkt += attn_mask weight = softmax(qkt) attn_heads = batch_matmul(weight, v) @@ -219,53 +230,57 @@ class TestTransformer(unittest.TestCase): # generate params for multi_head_attention batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( "attn", self_attention) - query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( - self_attention, batch_size, num_heads, query_length, - embed_dim, key_length, value_length, kdim, vdim, cache) - if cache and self_attention: - attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) - need_weight, param_attr, bias_attr = False, None, None - # call paddle's function - multi_head_attn = MultiHeadAttention( - embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, - param_attr, bias_attr) - # construct cache object - cache_obj = None - if cache_dict: - if 'k' and 'v' in cache_dict: - cache_obj = multi_head_attn.Cache( - paddle.to_tensor(cache_dict['k']), - paddle.to_tensor(cache_dict['v'])) - elif 'static_k' and 'static_v' in cache_dict: - cache_obj = multi_head_attn.StaticCache( - paddle.to_tensor(cache_dict['static_k']), - paddle.to_tensor(cache_dict['static_v'])) - if attn_mask is not None: - attn_output = multi_head_attn( - paddle.to_tensor(query), - paddle.to_tensor(key), - paddle.to_tensor(value), - paddle.to_tensor(attn_mask), cache_obj) - else: - attn_output = multi_head_attn( - paddle.to_tensor(query), - paddle.to_tensor(key), - paddle.to_tensor(value), attn_mask, cache_obj) - attn_output = 
attn_output[0] if cache_dict else attn_output - - # implementation by numpy - # compute q, k, v - q, k, v, _ = prepare_qkv(query, key, value, num_heads, - embed_dim, self_attention, - multi_head_attn, cache_dict) - # scale dot product attention - attn_heads = scaled_dot_product_attention( - q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn) - out_proj_weight = multi_head_attn.out_proj.weight.numpy() - reference = fc(attn_heads, out_proj_weight) - - np.testing.assert_allclose( - attn_output.numpy(), reference, atol=1e-6) + for attn_mask_type in ['int64', 'float64']: + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, attn_mask_type, key_length, value_length, + kdim, vdim, cache) + if cache and self_attention: + attn_mask = np.concatenate( + (attn_mask, attn_mask), axis=3) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + multi_head_attn = MultiHeadAttention( + embed_dim, num_heads, attn_dropout, kdim, vdim, + need_weight, param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_tensor(cache_dict['k']), + paddle.to_tensor(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_tensor(cache_dict['static_k']), + paddle.to_tensor(cache_dict['static_v'])) + if attn_mask is not None: + attn_output = multi_head_attn( + paddle.to_tensor(query), + paddle.to_tensor(key), + paddle.to_tensor(value), + paddle.to_tensor(attn_mask), cache_obj) + else: + attn_output = multi_head_attn( + paddle.to_tensor(query), + paddle.to_tensor(key), + paddle.to_tensor(value), attn_mask, cache_obj) + attn_output = attn_output[0] if cache_dict else attn_output + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, + multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) multihead_attention_test_helper(True, True) multihead_attention_test_helper(True, False) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 75f998b037e..5aded4949e2 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -34,6 +34,7 @@ from ... import tensor from ...fluid import layers from ...fluid.dygraph import Layer, LayerList from ...fluid.param_attr import ParamAttr +from ...fluid.data_feeder import convert_dtype def _convert_param_attr_to_list(param_attr, n): @@ -82,6 +83,35 @@ def _convert_param_attr_to_list(param_attr, n): return param_attrs +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. 
When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. + """ + if attn_mask is not None and attn_mask.dtype != dtype: + attn_mask_dtype = convert_dtype(attn_mask.dtype) + if attn_mask_dtype == 'bool' or 'int' in attn_mask_dtype: + attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 + else: + attn_mask = paddle.cast(attn_mask, dtype) + return attn_mask + + class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and @@ -105,7 +135,7 @@ class MultiHeadAttention(Layer): weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :code:`ParamAttr` . - bias_attr (ParamAttr, optional): To specify the bias parameter property. + bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. If it is set to False, this layer will not have trainable bias parameter. See usage for details in :code:`ParamAttr` . @@ -331,11 +361,13 @@ class MultiHeadAttention(Layer): attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - Default None + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results @@ -374,7 +406,8 @@ class MultiHeadAttention(Layer): product = layers.matmul( x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: - # TODO(guosheng): support bool mask + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) product = product + attn_mask weights = F.softmax(product) if self.dropout: @@ -509,11 +542,13 @@ class TransformerEncoderLayer(Layer): src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. 
It can - be None when nothing wanted or needed to be prevented attention to. - Default None + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default @@ -528,10 +563,12 @@ class TransformerEncoderLayer(Layer): incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. """ + src_mask = _convert_attention_mask(src_mask, src.dtype) + residual = src if self.normalize_before: src = self.norm1(src) - # TODO(guosheng): Add cache for encoder for the usage like UniLM + # Add cache for encoder for the usage like UniLM if cache is None: src = self.self_attn(src, src, src, src_mask) else: @@ -622,11 +659,13 @@ class TransformerEncoder(Layer): src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - Default None + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is `incremental_cache` produced by `TransformerEncoderLayer.gen_cache`. See `TransformerEncoder.gen_cache` for more details. It is only @@ -641,6 +680,8 @@ class TransformerEncoder(Layer): See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ + src_mask = _convert_attention_mask(src_mask, src.dtype) + output = src new_caches = [] for i, mod in enumerate(self.layers): @@ -808,18 +849,23 @@ class TransformerDecoderLayer(Layer): tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - Default None + to `[batch_size, n_head, target_length, target_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. 
When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, - usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`, where the - unwanted positions have `-INF` values and the others have 0 values. - The data type should be float32 or float64. It can be None when - nothing wanted or needed to be prevented attention to. Default None + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), `incremental_cache` is an instance of `MultiHeadAttention.Cache`, `static_cache` is an instance of `MultiHeadAttention.StaticCache. @@ -836,6 +882,9 @@ class TransformerDecoderLayer(Layer): See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + residual = tgt if self.normalize_before: tgt = self.norm1(tgt) @@ -958,18 +1007,23 @@ class TransformerDecoder(Layer): tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - Default None + to `[batch_size, n_head, target_length, target_length]`. When + the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`, where the - unwanted positions have `-INF` values and the others have 0 values. - The data type should be float32 or float64. It can be None when - nothing wanted or needed to be prevented attention to. Default None + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. 
When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. cache (list, optional): It is a list, and each element in the list is a tuple( :code:`(incremental_cache, static_cache)` ). See `TransformerDecoder.gen_cache` for more details. It is only @@ -984,6 +1038,9 @@ class TransformerDecoder(Layer): See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + output = tgt new_caches = [] for i, mod in enumerate(self.layers): @@ -1222,27 +1279,46 @@ class Transformer(Layer): memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - Default None + to `[batch_size, n_head, target_length, target_length]`. When + the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`, where the - unwanted positions have `-INF` values and the others have 0 values. - The data type should be float32 or float64. It can be None when - nothing wanted or needed to be prevented attention to. Default None + `[batch_size, n_head, target_length, source_length]`. When the + data type is bool, the unwanted positions have `False` values + and the others have `True` values. When the data type is int, + the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. 
Returns:
        Tensor: It is a tensor that has the same shape and data type \
            as `tgt`, representing the output of the Transformer decoder.
        """
+        src_mask = _convert_attention_mask(src_mask, src.dtype)
        memory = self.encoder(src, src_mask=src_mask)
+
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
        output = self.decoder(
            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask)
        return output
--
GitLab
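
Editor's note: the conversion rule that the new `_convert_attention_mask` helper applies (and that the numpy reference in `scaled_dot_product_attention` mirrors) can be summarized in a small standalone numpy sketch. This is illustrative only and not part of the patch; the function name `to_additive_mask` is made up here for clarity.

import numpy as np

def to_additive_mask(attn_mask, dtype="float32"):
    # Mimic _convert_attention_mask for a numpy array:
    # bool/int masks become additive float masks via (mask - 1) * 1e9,
    # so "allowed" (True / 1) -> 0 and "masked" (False / 0) -> -1e9.
    if attn_mask.dtype == np.bool_ or np.issubdtype(attn_mask.dtype, np.integer):
        return (attn_mask.astype(dtype) - 1.0) * 1e9
    return attn_mask.astype(dtype)

# A causal (lower-triangular) mask in int form: 1 = attend, 0 = block.
causal = np.tril(np.ones((1, 1, 4, 4), dtype="int64"))
additive = to_additive_mask(causal)
assert additive[0, 0, 0, 1] == -1e9  # a blocked (future) position
assert additive[0, 0, 1, 0] == 0.0   # an allowed position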
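A minimal usage sketch of the new behaviour, assuming a Paddle build that includes this patch: the same `MultiHeadAttention` layer should accept a bool, int, or additive float mask and produce identical outputs. Shapes and sizes below are arbitrary; this only restates what the unit test above checks.

import numpy as np
import paddle

batch_size, n_head, seq_len, d_model = 2, 2, 4, 8
query = paddle.rand([batch_size, seq_len, d_model])

# The same causal mask in the three accepted forms. Unwanted positions are
# False / 0 / -1e9 respectively; wanted positions are True / 1 / 0.
causal = np.tril(np.ones([batch_size, n_head, seq_len, seq_len]))
bool_mask = paddle.to_tensor(causal.astype("bool"))
int_mask = paddle.to_tensor(causal.astype("int64"))
float_mask = paddle.to_tensor(((causal - 1.0) * 1e9).astype("float32"))

mha = paddle.nn.MultiHeadAttention(embed_dim=d_model, num_heads=n_head)

# With dropout left at 0.0, all three masks should give the same result.
out_bool = mha(query, query, query, attn_mask=bool_mask)
out_int = mha(query, query, query, attn_mask=int_mask)
out_float = mha(query, query, query, attn_mask=float_mask)

np.testing.assert_allclose(out_bool.numpy(), out_float.numpy(), atol=1e-6)
np.testing.assert_allclose(out_int.numpy(), out_float.numpy(), atol=1e-6)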
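The stacked layers funnel their masks through `_convert_attention_mask` as well, so an integer padding mask can be passed straight to `TransformerEncoder`. Again a hedged sketch with made-up hyper-parameters, assuming the patched API; the mask is broadcast to `[batch_size, n_head, sequence_length, sequence_length]` as described in the docstrings.

import numpy as np
import paddle

d_model, n_head, seq_len, batch_size = 32, 4, 6, 2

enc_layer = paddle.nn.TransformerEncoderLayer(d_model, n_head, dim_feedforward=64)
encoder = paddle.nn.TransformerEncoder(enc_layer, num_layers=2)

src = paddle.rand([batch_size, seq_len, d_model])

# An int padding mask: 1 = attend, 0 = ignore (here the last two tokens
# of every sequence are treated as padding).
pad = np.ones([batch_size, 1, 1, seq_len], dtype="int64")
pad[:, :, :, -2:] = 0
src_mask = paddle.to_tensor(pad)

out = encoder(src, src_mask=src_mask)
print(out.shape)  # [2, 6, 32]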