diff --git a/labml_nn/transformers/__init__.py b/labml_nn/transformers/__init__.py
index 7bf6c9cfa0e9828b0bd91d2a0b080e17b7574559..52ac31d6c63b083f59664a559dbc8297d547629f 100644
--- a/labml_nn/transformers/__init__.py
+++ b/labml_nn/transformers/__init__.py
@@ -1,6 +1,4 @@
"""
-Star
-
# Transformers
* [Multi-head attention](mha.html)
diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py
index 863ba3515131a9f633123c4a9c69ecda5e235eaf..4f635e7967a22915a32f18c1cb6126df2bffce71 100644
--- a/labml_nn/transformers/mha.py
+++ b/labml_nn/transformers/mha.py
@@ -1,6 +1,4 @@
"""
-Star
-
# Multi-Headed Attention
The implementation is inspired from [Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html)
diff --git a/labml_nn/transformers/models.py b/labml_nn/transformers/models.py
index e9aaa81a109820f111e520e9f9dac9534fa448ac..361477b4bf6179b0eb2704e8190fb18d896984f7 100644
--- a/labml_nn/transformers/models.py
+++ b/labml_nn/transformers/models.py
@@ -11,6 +11,9 @@ from .positional_encoding import get_positional_encoding
class EmbeddingsWithPositionalEncoding(Module):
+ """
+ ## Embed tokens and add [fixed positional encoding](positional_encoding.html)
+ """
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
self.linear = nn.Embedding(n_vocab, d_model)
@@ -23,6 +26,9 @@ class EmbeddingsWithPositionalEncoding(Module):
class EmbeddingsWithLearnedPositionalEncoding(Module):
+ """
+ ## Embed tokens and add parameterized positional encodings
+ """
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
self.linear = nn.Embedding(n_vocab, d_model)
@@ -35,6 +41,9 @@ class EmbeddingsWithLearnedPositionalEncoding(Module):
class FeedForward(Module):
+ """
+ ## Position-wise feed-forward network with a hidden layer
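+
+ As defined in the original paper,
+ $\mathrm{FFN}(x) = \max(0, x W_1 + b_1) W_2 + b_2$;
+ that is, two linear transformations with an activation in between.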
+ """
def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
super().__init__()
self.layer1 = nn.Linear(d_model, d_ff)
@@ -49,6 +58,20 @@ class FeedForward(Module):
class TransformerLayer(Module):
+ """
+ ## Transformer Layer
+
+ This can act as an encoder layer or a decoder layer.
+
+ 🗒 Some implementations, including the paper, seem to differ on where
+ the layer normalization is applied.
+ Here we apply layer normalization before the attention and feed-forward networks,
+ and add the original residual vectors afterwards.
+ The alternative is to apply layer normalization after adding the residuals,
+ but we found that to be less stable during training.
+ There is a detailed discussion of this in the paper
+ [On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745).
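+
+ In other words, this layer computes $x + \mathrm{Sublayer}(\mathrm{LayerNorm}(x))$
+ (pre-norm), whereas the post-norm alternative computes
+ $\mathrm{LayerNorm}(x + \mathrm{Sublayer}(x))$.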
+ """
def __init__(self, *,
d_model: int,
self_attn: MultiHeadAttention,
@@ -71,47 +94,77 @@ class TransformerLayer(Module):
mask: torch.Tensor,
src: torch.Tensor = None,
src_mask: torch.Tensor = None):
+ # Normalize the vectors before doing self attention
z = self.norm_self_attn(x)
- attn_self = self.self_attn(query=z, key=z, value=z, mask=mask)
- x = x + self.dropout(attn_self)
-
+ # Run through self-attention, i.e. keys and values are from x itself
+ self_attn = self.self_attn(query=z, key=z, value=z, mask=mask)
+ # Add the self attention results
+ x = x + self.dropout(self_attn)
+
+ # If a source is provided, compute attention over the source.
+ # This is the case when a decoder layer attends to the
+ # encoder outputs.
if src is not None:
+ # Normalize vectors
z = self.norm_src_attn(x)
+ # Attend to the source, i.e. keys and values come from the source
attn_src = self.src_attn(query=z, key=src, value=src, mask=src_mask)
+ # Add the source attention results
x = x + self.dropout(attn_src)
+ # Normalize for feed-forward
z = self.norm_ff(x)
+ # Pass through the feed-forward network
ff = self.feed_forward(z)
+ # Add the feed-forward results back
x = x + self.dropout(ff)
return x
class Encoder(Module):
+ """
+ ## Transformer Encoder
+ """
def __init__(self, layer: TransformerLayer, n_layers: int):
super().__init__()
+ # Make copies of the transformer layer
self.layers = clone_module_list(layer, n_layers)
self.norm = nn.LayerNorm([layer.size])
def __call__(self, x: torch.Tensor, mask: torch.Tensor):
+ # Run through each transformer layer
for layer in self.layers:
x = layer(x=x, mask=mask)
+ # Finally, normalize the vectors
return self.norm(x)
class Decoder(Module):
+ """
+ ## Transformer Decoder
+ """
def __init__(self, layer: TransformerLayer, n_layers: int):
super().__init__()
+ # Make copies of the transformer layer
self.layers = clone_module_list(layer, n_layers)
self.norm = nn.LayerNorm([layer.size])
- def __call__(self, x, memory, src_mask, tgt_mask):
+ def __call__(self, x: torch.Tensor, memory: torch.Tensor, src_mask: torch.Tensor, tgt_mask: torch.Tensor):
+ # Run through each transformer layer
for layer in self.layers:
x = layer(x=x, mask=tgt_mask, src=memory, src_mask=src_mask)
+ # Finally, normalize the vectors
return self.norm(x)
class Generator(Module):
+ """
+ ## Generator
+
+ This predicts the tokens and gives their log softmax.
+ You don't need this if you are using `nn.CrossEntropyLoss`.
+ """
def __init__(self, n_vocab: int, d_model: int):
super().__init__()
self.projection = nn.Linear(d_model, n_vocab)
@@ -121,6 +174,9 @@ class Generator(Module):
class EncoderDecoder(Module):
+ """
+ ## Combined Encoder-Decoder
+ """
def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: Module, tgt_embed: Module, generator: Module):
super().__init__()
self.encoder = encoder
@@ -135,10 +191,11 @@ class EncoderDecoder(Module):
if p.dim() > 1:
nn.init.xavier_uniform_(p)
- def __call__(self, src: torch.Tensor, tgt: torch.Tensor, src_mask: torch.Tensor,
- tgt_mask: torch.Tensor):
- return self.decode(self.encode(src, src_mask), src_mask,
- tgt, tgt_mask)
+ def __call__(self, src: torch.Tensor, tgt: torch.Tensor, src_mask: torch.Tensor, tgt_mask: torch.Tensor):
+ # Run the source through the encoder
+ enc = self.encode(src, src_mask)
+ # Run the encodings and targets through the decoder
+ return self.decode(enc, src_mask, tgt, tgt_mask)
def encode(self, src: torch.Tensor, src_mask: torch.Tensor):
return self.encoder(self.src_embed(src), src_mask)
diff --git a/labml_nn/transformers/positional_encoding.py b/labml_nn/transformers/positional_encoding.py
index 731fc0084e76c05bcfe7cabc736695e8e2ddb497..82957b46b45da3ba689a66e5cea836acd71ede64 100644
--- a/labml_nn/transformers/positional_encoding.py
+++ b/labml_nn/transformers/positional_encoding.py
@@ -1,3 +1,18 @@
+"""
+# Fixed Positional Encodings
+
+The positional encoding encodes the position along the sequence into
+ a vector of size `d_model`.
+
+\begin{align}
+PE_{p,2i} &= \sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
+PE_{p,2i + 1} &= \cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
+\end{align}
+
+where $0 \leq 2i, 2i + 1 < d_{model}$ are the feature indices in the encoding,
+and $p$ is the position.
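+
+For example, with $d_{model} = 4$ the encoding of position $p$ is
+$\big[\sin(p),\ \cos(p),\ \sin(p / 100),\ \cos(p / 100)\big]$,
+since $10000^{0/4} = 1$ and $10000^{2/4} = 100$.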
+"""
+
import math
import matplotlib.pyplot as plt
@@ -23,12 +38,20 @@ class PositionalEncoding(Module):
def get_positional_encoding(d_model: int, max_len: int = 5000):
+ # Empty encoding vectors
encodings = torch.zeros(max_len, d_model)
+ # Position indexes
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
+ # $2 * i$
two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
+ # $\frac{1}{10000^{\frac{2i}{d_{model}}}}$
div_term = torch.exp(two_i * -(math.log(10000.0) / d_model))
+ # $PE_{p,2i} = \sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$
encodings[:, 0::2] = torch.sin(position * div_term)
+ # $PE_{p,2i + 1} = \cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)$
encodings[:, 1::2] = torch.cos(position * div_term)
+
+ # Add batch dimension
encodings = encodings.unsqueeze(1).requires_grad_(False)
return encodings
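+
+# For example (illustrative):
+#     pe = get_positional_encoding(d_model=512, max_len=100)
+#     pe.shape  # => torch.Size([100, 1, 512])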
diff --git a/labml_nn/transformers/relative_mha.py b/labml_nn/transformers/relative_mha.py
index 43d61aeff593f3067bcfb69dd841567ed422e362..1f6f8bb602d858b42c5be4a53dc37e7f8428cdc2 100644
--- a/labml_nn/transformers/relative_mha.py
+++ b/labml_nn/transformers/relative_mha.py
@@ -1,6 +1,4 @@
"""
-Star
-
# Relative Multi-head Attention
This is an implementation of