Commit 1b0f8944 authored by Varuna Jayasiri

spelling

Parent 330df7e5
@@ -9,17 +9,17 @@ from typing import Optional
import torch
from labml import tracker
from labml_helpers.module import Module
from torch import nn as nn
from torch.nn import functional as F
from labml_helpers.module import Module
class PrepareForMultiHeadAttention(Module):
"""
This module does a linear transformation and splits the vector into given
number of heads for multi-head attention.
"""
def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
super().__init__()
self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
@@ -38,7 +38,7 @@ class PrepareForMultiHeadAttention(Module):
class MultiHeadAttention(Module):
-def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool=True):
+def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
"""
## Multi-Head Attention Module
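For context, `PrepareForMultiHeadAttention` above projects the input with a single `nn.Linear` and then splits the result into `heads` vectors of size `d_k`. A minimal standalone sketch of that reshaping (tensor names and sizes are illustrative, not taken from this commit):

```python
import torch
from torch import nn

# Illustrative sizes (not from the commit)
seq_len, batch_size, d_model = 10, 4, 512
heads, d_k = 8, 64  # here heads * d_k == d_model, but that is not required

x = torch.randn(seq_len, batch_size, d_model)

# One linear projection to heads * d_k features, then split the last
# dimension into (heads, d_k) -- the step the docstring describes.
linear = nn.Linear(d_model, heads * d_k, bias=True)
head_shape = x.shape[:-1]              # (seq_len, batch_size)
x = linear(x)                          # (seq_len, batch_size, heads * d_k)
x = x.view(*head_shape, heads, d_k)    # (seq_len, batch_size, heads, d_k)

print(x.shape)  # torch.Size([10, 4, 8, 64])
```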
@@ -12,7 +12,7 @@ from .positional_encoding import get_positional_encoding
class EmbeddingsWithPositionalEncoding(Module):
"""
-## Embed tokenas and add [fixed positional encoding](positional_encoding.html)
+## Embed tokens and add [fixed positional encoding](positional_encoding.html)
"""
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
@@ -27,13 +27,13 @@ class EmbeddingsWithPositionalEncoding(Module):
class EmbeddingsWithLearnedPositionalEncoding(Module):
"""
-## Embed tokenas and add parameterized positional encodings
+## Embed tokens and add parameterized positional encodings
"""
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
self.linear = nn.Embedding(n_vocab, d_model)
self.d_model = d_model
-self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model))
+self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)
def __call__(self, x: torch.Tensor):
pe = self.positional_encodings[:x.shape[0]]
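Note that `requires_grad=True` is already the default for `nn.Parameter`, so the added argument only makes the intent explicit. A small standalone sketch of embedding tokens and adding learned positional encodings (sizes are illustrative; the √d_model scaling follows the original Transformer paper and is assumed here, not shown in this hunk):

```python
import torch
from torch import nn

# Illustrative sizes
max_len, d_model, n_vocab = 5000, 512, 10000

# requires_grad=True is the default for nn.Parameter; writing it out
# just documents that these encodings are learned during training.
positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model),
                                    requires_grad=True)
embedding = nn.Embedding(n_vocab, d_model)

# Toy batch of token ids with shape (seq_len, batch_size)
x = torch.randint(0, n_vocab, (10, 4))
pe = positional_encodings[:x.shape[0]]        # (10, 1, d_model)
out = embedding(x) * (d_model ** 0.5) + pe    # broadcasts over the batch
print(out.shape)  # torch.Size([10, 4, 512])
```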
@@ -67,7 +67,7 @@ class TransformerLayer(Module):
in where the layer-normalization is done.
Here we do a layer normalization before attention and feed-forward networks,
and add the original residual vectors.
-Alternative is to do a layer normalzation after adding the residuals.
+Alternative is to do a layer normalization after adding the residuals.
But we found this to be less stable when training.
We found a detailed discussion about this in paper
[On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745).
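The two orderings discussed in the docstring can be sketched with a generic sub-layer standing in for attention or the feed-forward network (illustrative code, not this repository's `TransformerLayer`):

```python
import torch
from torch import nn

d_model = 512
norm = nn.LayerNorm([d_model])
sublayer = nn.Linear(d_model, d_model)  # stand-in for attention or the FFN

x = torch.randn(10, 4, d_model)

# Pre-norm (what the docstring describes): normalize first,
# then add the residual of the original x.
pre_norm_out = x + sublayer(norm(x))

# Post-norm (the alternative described as less stable in training):
# add the residual first, then normalize.
post_norm_out = norm(x + sublayer(x))

print(pre_norm_out.shape, post_norm_out.shape)
```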
@@ -162,7 +162,7 @@ class Generator(Module):
"""
## Generator
-This predicts the tokens and gives the lof softmaxes of those.
+This predicts the tokens and gives the lof softmax of those.
You don't need this if you are using `nn.CrossEntropyLoss`.
"""
def __init__(self, n_vocab: int, d_model: int):
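As the docstring says, `nn.CrossEntropyLoss` applies `log_softmax` followed by negative log-likelihood internally, so it expects raw logits. A rough sketch of both paths (illustrative sizes, not the library's `Generator` class):

```python
import torch
from torch import nn
from torch.nn import functional as F

# Illustrative sizes
n_vocab, d_model = 10000, 512
projection = nn.Linear(d_model, n_vocab)

x = torch.randn(10, 4, d_model)            # stand-in for decoder output
logits = projection(x)                     # (seq_len, batch, n_vocab)
log_probs = F.log_softmax(logits, dim=-1)  # what a generator would return

# With nn.CrossEntropyLoss you feed the raw logits instead, because the
# loss applies log_softmax + negative log-likelihood itself.
targets = torch.randint(0, n_vocab, (10, 4))
loss = nn.CrossEntropyLoss()(logits.view(-1, n_vocab), targets.view(-1))
print(log_probs.shape, loss.item())
```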
@@ -9,8 +9,8 @@ PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
\end{align}
-Where $1 \leq 2i, 2i + 1 \leq d_{model}$ are the feature indexes in the encoding,
-and $p$ is the position.
+Where $1 \leq 2i, 2i + 1 \leq d_{model}$
+are the feature indexes in the encoding, and $p$ is the position.
"""
import math
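A standalone sketch that computes the encodings from the two formulas above (the function name and default sizes are made up for illustration; the repository's own implementation is `get_positional_encoding`):

```python
import math

import torch


def sinusoidal_encodings(d_model: int = 512, max_len: int = 5000) -> torch.Tensor:
    # encodings[p, 2i]     = sin(p / 10000^(2i / d_model))
    # encodings[p, 2i + 1] = cos(p / 10000^(2i / d_model))
    encodings = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
    two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
    div_term = torch.exp(-math.log(10000.0) * two_i / d_model)
    encodings[:, 0::2] = torch.sin(position * div_term)
    encodings[:, 1::2] = torch.cos(position * div_term)
    # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension
    return encodings.unsqueeze(1)


pe = sinusoidal_encodings()
print(pe.shape)  # torch.Size([5000, 1, 512])
```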