From e2991555f1aad24952947e8418e03c040b0bf862 Mon Sep 17 00:00:00 2001 From: Feng Xing <79969986+xingfeng01@users.noreply.github.com> Date: Tue, 31 Aug 2021 18:58:05 +0800 Subject: [PATCH] transformer opt python files (#35206) This PR adds fused transformer python related files. It defines interface of fused transformer. Fused transformer implements an optimized version of transformer layer (in python/paddle/nn/layer/transformer.py). In this PR, four layers (functions) are defined: (1) FusedMultiHeadAttention: multi-head attention layer (2) FusedFeedForward: feed forward layer (3) FusedTransformerEncoderLayer: transformer encoder layer (4) FusedTransformer: transformer layer --- python/paddle/nn/layer/fused_transformer.py | 361 ++++++++++++++++++++ 1 file changed, 361 insertions(+) create mode 100644 python/paddle/nn/layer/fused_transformer.py diff --git a/python/paddle/nn/layer/fused_transformer.py b/python/paddle/nn/layer/fused_transformer.py new file mode 100644 index 0000000000..6b24346c2b --- /dev/null +++ b/python/paddle/nn/layer/fused_transformer.py @@ -0,0 +1,361 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class FusedMultiHeadAttention(Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in value. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + weight_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :code:`ParamAttr` . + bias_attr (ParamAttr|bool, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr` . + + Examples: + + .. code-block:: python + + import paddle + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) + output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] + """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=False, + weight_attr=None, + bias_attr=None): + super(FusedMultiHeadAttention, self).__init__() + + def forward(self, query, key=None, value=None, attn_mask=None, cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + Parameters: + query (Tensor): The queries for multi-head attention. It is a + tensor with shape `[batch_size, query_length, embed_dim]`. The + data type should be float32 or float64. + key (Tensor, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Tensor, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. + """ + pass + + +class FusedFeedForward(Layer): + def __init__(self, + d_model, + dim_feedforward, + dropout=0.1, + activation="relu", + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None): + + super(FusedFeedForward, self).__init__() + + def forward(self, src, cache=None): + pass + + +class FusedTransformerEncoderLayer(Layer): + """ + TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) + attention and feedforward network. Before and after each sub-layer, pre-process + and post-precess would be applied on the input and output accordingly. If + `normalize_before` is True, pre-process is layer normalization and post-precess + includes dropout, residual connection. Otherwise, no pre-process and post-precess + includes dropout, residual connection, layer normalization. + + Parameters: + d_model (int): The expected feature size in the input and output. + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. + If it is a list/tuple, `weight_attr[0]` would be used as `weight_attr` for + MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details in :code:`ParamAttr` . + bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. + If it is a list/tuple, `bias_attr[0]` would be used as `bias_attr` for + MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . Default: None, + which means the default bias parameter property is used. + + + Examples: + + .. code-block:: python + + import paddle + from paddle.nn import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(128, 2, 512) + enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + + super(FusedTransformerEncoderLayer, self).__init__() + + def forward(self, src, src_mask=None, cache=None): + """ + Applies a Transformer encoder layer on the input. + Parameters: + src (Tensor): The input of Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + src_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. + See `TransformerEncoderLayer.gen_cache` for more details. It is + only used for inference and should be None for training. Default + None. + Returns: + Tensor|tuple: It is a tensor that has the same shape and data type \ + as `enc_input`, representing the output of Transformer encoder \ + layer. Or a tuple if `cache` is not None, except for encoder \ + layer output, the tuple includes the new cache which is same \ + as input `cache` argument but `incremental_cache` has an \ + incremental length. See `MultiHeadAttention.gen_cache` and \ + `MultiHeadAttention.forward` for more details. + """ + pass + + +class FusedTransformer(Layer): + """ + A Transformer model composed of an instance of `TransformerEncoder` and an + instance of `TransformerDecoder`. While the embedding layer and output layer + are not included. + + Please refer to `Attention is all you need `_ , + and see `TransformerEncoder` and `TransformerDecoder` for more details. + + Users can configurate the model architecture with corresponding parameters. + Note the usage of `normalize_before` representing where to apply layer + normalization (in pre-process or post-precess of multi-head attention or FFN), + and some transformer like models are different on this, such as + `BERT `_ and `GPT2 `_ . + The default architecture here places layer normalization in post-process and + applies another layer normalization on the output of last encoder/decoder layer. + + Parameters: + d_model (int, optional): The expected feature size in the encoder/decoder input + and output. Default 512 + nhead (int, optional): The number of heads in multi-head attention(MHA). Default 8 + num_encoder_layers (int, optional): The number of layers in encoder. Default 6 + num_decoder_layers (int, optional): The number of layers in decoder. Default 6 + dim_feedforward (int, optional): The hidden layer size in the feedforward network(FFN). Default 2048 + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + weight_attr(ParamAttr|list|tuple, optional): To specify the weight parameter property. + If it is a list/tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, + `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` + would be used as `weight_attr` for cross attention of `TransformerDecoder`, + and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. + If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention + and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details + in :code:`ParamAttr` . + bias_attr (ParamAttr|list|tuple|bool, optional): To specify the bias parameter property. + If it is a list/tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, + `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` + would be used as `bias_attr` for cross attention of `TransformerDecoder`, + and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. + If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention + and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . + Default: None,which means the default bias parameter property is used. + custom_encoder (Layer, optional): If custom encoder is provided, use it as the encoder. + Default None + custom_decoder (Layer, optional): If custom decoder is provided, use it as the decoder. + Default None + + Examples: + + .. code-block:: python + + import paddle + from paddle.nn import Transformer + + # src: [batch_size, tgt_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # tgt: [batch_size, src_len, d_model] + dec_input = paddle.rand((2, 6, 128)) + # src_mask: [batch_size, n_head, src_len, src_len] + enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) + # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] + dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) + # memory_mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 6, 4)) + transformer = Transformer(128, 2, 4, 4, 512) + output = transformer(enc_input, + dec_input, + enc_self_attn_mask, + dec_self_attn_mask, + cross_attn_mask) # [2, 6, 128] + """ + + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + custom_encoder=None, + custom_decoder=None): + super(fusedTransformer, self).__init__() + + def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): + pass -- GitLab