From 1b0f8944bded49da536fe9d18eac70154d1c273f Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Sat, 5 Sep 2020 13:25:27 +0530
Subject: [PATCH] spelling

---
 labml_nn/transformers/mha.py                 |  6 +++---
 labml_nn/transformers/models.py              | 10 +++++-----
 labml_nn/transformers/positional_encoding.py |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py
index 4f635e79..065a4ba6 100644
--- a/labml_nn/transformers/mha.py
+++ b/labml_nn/transformers/mha.py
@@ -9,17 +9,17 @@ from typing import Optional
 
 import torch
 from labml import tracker
+from labml_helpers.module import Module
 from torch import nn as nn
 from torch.nn import functional as F
 
-from labml_helpers.module import Module
-
 
 class PrepareForMultiHeadAttention(Module):
     """
     This module does a linear transformation and splits the vector into given
     number of heads for multi-head attention.
     """
+
     def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
         super().__init__()
         self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
@@ -38,7 +38,7 @@ class PrepareForMultiHeadAttention(Module):
 
 
 class MultiHeadAttention(Module):
-    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool=True):
+    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
         """
         ## Multi-Head Attention Module
 
diff --git a/labml_nn/transformers/models.py b/labml_nn/transformers/models.py
index 361477b4..e2f71f32 100644
--- a/labml_nn/transformers/models.py
+++ b/labml_nn/transformers/models.py
@@ -12,7 +12,7 @@ from .positional_encoding import get_positional_encoding
 
 class EmbeddingsWithPositionalEncoding(Module):
     """
-    ## Embed tokenas and add [fixed positional encoding](positional_encoding.html)
+    ## Embed tokens and add [fixed positional encoding](positional_encoding.html)
     """
     def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
         super().__init__()
@@ -27,13 +27,13 @@ class EmbeddingsWithPositionalEncoding(Module):
 
 class EmbeddingsWithLearnedPositionalEncoding(Module):
     """
-    ## Embed tokenas and add parameterized positional encodings
+    ## Embed tokens and add parameterized positional encodings
     """
     def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
         super().__init__()
         self.linear = nn.Embedding(n_vocab, d_model)
         self.d_model = d_model
-        self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model))
+        self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)
 
     def __call__(self, x: torch.Tensor):
         pe = self.positional_encodings[:x.shape[0]]
@@ -67,7 +67,7 @@ class TransformerLayer(Module):
     in where the layer-normalization is done.
     Here we do a layer normalization before attention and feed-forward networks,
     and add the original residual vectors.
-    Alternative is to do a layer normalzation after adding the residuals.
+    Alternative is to do a layer normalization after adding the residuals.
     But we found this to be less stable when training.
     We found a detailed discussion about this in paper
     [On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745).
@@ -162,7 +162,7 @@ class Generator(Module):
     """
     ## Generator
 
-    This predicts the tokens and gives the lof softmaxes of those.
+    This predicts the tokens and gives the log softmax of those.
     You don't need this if you are using `nn.CrossEntropyLoss`.
     """
     def __init__(self, n_vocab: int, d_model: int):
diff --git a/labml_nn/transformers/positional_encoding.py b/labml_nn/transformers/positional_encoding.py
index 82957b46..d3f4f9f1 100644
--- a/labml_nn/transformers/positional_encoding.py
+++ b/labml_nn/transformers/positional_encoding.py
@@ -9,8 +9,8 @@
 PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
 PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
 \end{align}
-Where $1 \leq 2i, 2i + 1 \leq d_{model}$ are the feature indexes in the encoding,
-and $p$ is the position.
+Where $1 \leq 2i, 2i + 1 \leq d_{model}$
+ are the feature indexes in the encoding, and $p$ is the position.
 """
 
 import math
-- 
GitLab