import math

import torch
from torch import nn

from pytorch_widedeep.wdtypes import Union, Tensor, Optional
from pytorch_widedeep.utils.general_utils import Alias
from pytorch_widedeep.models.tabular.transformers._encoders import (
    TransformerEncoder,
)


class Transformer(nn.Module):
    r"""Basic Encoder-Only Transformer model for text
    classification/regression. As with all other models in the library, this
    model can be used as the `deeptext` component of a Wide & Deep model or
    independently by itself.

    :information_source: **NOTE**: This model was introduced in the context
    of recommendation systems and is intended for sequences of any nature
    (e.g. items). It can, of course, still be used for text. However, at this
    stage, we have decided not to include the possibility of loading
    pretrained word vectors, since we aim to integrate the library with
    Huggingface in the (hopefully) near future.

    Parameters
    ----------
    vocab_size: int
        Number of words in the vocabulary
    input_dim: int
        Dimension of the token embeddings

        Param aliases: `embed_dim`, `d_model`.
    seq_length: int
        Input sequence length

        Param aliases: `max_length`, `maxlen`.
    n_heads: int
        Number of attention heads per Transformer block
    n_blocks: int
        Number of Transformer blocks
    attn_dropout: float, default = 0.1
        Dropout that will be applied to the Multi-Head Attention layers
    ff_dropout: float, default = 0.1
        Dropout that will be applied to the FeedForward network
    ff_factor: int, default = 4
        Multiplicative factor applied to the first layer of the FF network in
        each Transformer block. This is normally set to 4.
    activation: str, default = "gelu"
        Transformer Encoder activation function. _'tanh'_, _'relu'_,
        _'leaky_relu'_, _'gelu'_, _'geglu'_ and _'reglu'_ are supported
    use_linear_attention: bool, default = False
        Boolean indicating if linear attention will be used in the
        Transformer blocks
    use_flash_attention: bool, default = False
        Boolean indicating if flash attention will be used in the Transformer
        blocks
    padding_idx: int, default = 0
        Index of the padding token in the padded-tokenised sequences.
    with_cls_token: bool, default = False
        Boolean indicating if a `'[CLS]'` token is included in the tokenized
        sequences. If present, the final hidden state corresponding to this
        token is used as the aggregated representation for classification and
        regression tasks. **NOTE**: if included in the tokenized sequences it
        must be inserted as the first token in the sequences.
    with_pos_encoding: bool, default = True
        Boolean indicating if positional encoding will be used
    pos_encoding_dropout: float, default = 0.1
        Positional encoding dropout
    pos_encoder: nn.Module, Optional, default = None
        By default this model uses a standard positional encoding approach.
        However, any custom positional encoder can also be used and passed to
        the Transformer model via the 'pos_encoder' parameter

    Attributes
    ----------
    embedding: nn.Module
        Standard token embedding layer
    pos_encoder: nn.Module
        Positional Encoder
    encoder: nn.Module
        Sequence of Transformer blocks

    Examples
    --------
    >>> import torch
    >>> from pytorch_widedeep.models import Transformer
    >>> X_text = torch.cat((torch.zeros([5, 1]), torch.empty(5, 4).random_(1, 4)), axis=1)
    >>> model = Transformer(vocab_size=4, seq_length=5, input_dim=8, n_heads=1, n_blocks=1)
    >>> out = model(X_text)
    """

    @Alias("input_dim", ["embed_dim", "d_model"])
    @Alias("seq_length", ["max_length", "maxlen"])
    def __init__(
        self,
        vocab_size: int,
        seq_length: int,
        input_dim: int,
        n_heads: int,
        n_blocks: int,
        attn_dropout: float = 0.1,
        ff_dropout: float = 0.1,
        ff_factor: int = 4,
        activation: str = "gelu",
        use_linear_attention: bool = False,
        use_flash_attention: bool = False,
        padding_idx: int = 0,
        with_cls_token: bool = False,
        *,  # from here on pos encoding args
        with_pos_encoding: bool = True,
        pos_encoding_dropout: float = 0.1,
        pos_encoder: Optional[nn.Module] = None,
    ):
        super().__init__()

        self.input_dim = input_dim
        self.seq_length = seq_length
        self.n_heads = n_heads
        self.n_blocks = n_blocks
        self.attn_dropout = attn_dropout
        self.ff_dropout = ff_dropout
        self.ff_factor = ff_factor
        self.activation = activation
        self.use_linear_attention = use_linear_attention
        self.use_flash_attention = use_flash_attention
        self.padding_idx = padding_idx
        self.with_cls_token = with_cls_token
        self.with_pos_encoding = with_pos_encoding
        self.pos_encoding_dropout = pos_encoding_dropout

        self.embedding = nn.Embedding(
            vocab_size, input_dim, padding_idx=self.padding_idx
        )

        if with_pos_encoding:
            if pos_encoder is not None:
                self.pos_encoder: Union[
                    nn.Module, nn.Identity, PositionalEncoding
                ] = pos_encoder
            else:
                self.pos_encoder = PositionalEncoding(
                    input_dim, pos_encoding_dropout, seq_length
                )
        else:
            self.pos_encoder = nn.Identity()

        self.encoder = nn.Sequential()
        for i in range(n_blocks):
            self.encoder.add_module(
                "transformer_block" + str(i),
                TransformerEncoder(
                    input_dim,
                    n_heads,
                    False,  # use_qkv_bias
                    attn_dropout,
                    ff_dropout,
                    ff_factor,
                    activation,
                    use_linear_attention,
                    use_flash_attention,
                ),
            )

    def forward(self, X: Tensor) -> Tensor:
        x = self.embedding(X.long())
        x = self.pos_encoder(x)
        x = self.encoder(x)
        if self.with_cls_token:
            # use the hidden state of the first ('[CLS]') token as the
            # aggregated sequence representation
            x = x[:, 0, :]
        else:
            # otherwise concatenate the hidden states of all tokens
            x = x.flatten(1)
        return x

    @property
    def output_dim(self) -> int:
        if self.with_cls_token:
            output_dim = self.input_dim
        else:
            output_dim = self.input_dim * self.seq_length
        return output_dim


class PositionalEncoding(nn.Module):
    """Positional Encoding copied (and lightly adapted) from the
    [Beginners' Tutorial](https://pytorch.org/tutorials/beginner/transformer_tutorial.html)
    at the Pytorch site. Here the input sequence length must be specified
    and, in our implementation, the input tensor dimensions are arranged as
    `[batch_size, seq_len, embedding_dim]` instead of
    `[seq_len, batch_size, embedding_dim]`, as in the aforementioned
    tutorial.

    Parameters
    ----------
    input_dim: int
        Dimension of the token embeddings
    dropout: float
        Positional encoding dropout
    seq_length: int
        Input sequence length
    """

    def __init__(self, input_dim: int, dropout: float, seq_length: int):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # sinusoidal table of shape [1, seq_length, input_dim]: sines on the
        # even embedding dimensions, cosines on the odd ones
        position = torch.arange(seq_length).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, input_dim, 2) * (-math.log(10000.0) / input_dim)
        )
        pe = torch.zeros(1, seq_length, input_dim)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer("pe", pe)

    def forward(self, X: Tensor) -> Tensor:
        return self.dropout(X + self.pe)
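

# ----------------------------------------------------------------------------
# The block below is a minimal usage sketch, not part of the library's public
# API: it exercises the `Transformer` above with and without a '[CLS]' token
# and with a custom `pos_encoder`. The token ids and hyperparameter values
# are illustrative assumptions only.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    batch_size, seq_length, input_dim = 5, 6, 16

    # toy tokenised input: column 0 holds a hypothetical '[CLS]' token (id 1),
    # the remaining columns hold random token ids (0 is the padding index)
    X_text = torch.cat(
        (
            torch.ones(batch_size, 1),
            torch.empty(batch_size, seq_length - 1).random_(2, 10),
        ),
        dim=1,
    )

    # default sinusoidal positional encoding; the output concatenates the
    # hidden states of all tokens, i.e. input_dim * seq_length features
    model = Transformer(
        vocab_size=10,
        seq_length=seq_length,
        input_dim=input_dim,
        n_heads=2,
        n_blocks=2,
    )
    out = model(X_text)
    assert out.shape == (batch_size, input_dim * seq_length)

    # custom positional encoder (here simply nn.Identity) plus a '[CLS]'
    # token, so the aggregated representation is the first hidden state
    cls_model = Transformer(
        vocab_size=10,
        seq_length=seq_length,
        input_dim=input_dim,
        n_heads=2,
        n_blocks=2,
        with_cls_token=True,
        pos_encoder=nn.Identity(),
    )
    cls_out = cls_model(X_text)
    assert cls_out.shape == (batch_size, cls_model.output_dim)  # (batch_size, input_dim)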