Commit 1b0f8944 authored by Varuna Jayasiri

spelling

Parent 330df7e5
@@ -9,17 +9,17 @@ from typing import Optional
 import torch
 from labml import tracker
-from labml_helpers.module import Module
 from torch import nn as nn
 from torch.nn import functional as F
+from labml_helpers.module import Module
 
 
 class PrepareForMultiHeadAttention(Module):
     """
     This module does a linear transformation and splits the vector into given
     number of heads for multi-head attention.
     """
 
     def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
         super().__init__()
         self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
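The docstring above describes a linear transformation whose output is split into attention heads. As a minimal sketch of that idea (the module's forward pass is outside this hunk, so the reshape below is an illustration with assumed shapes, not the repository's exact code):

```python
import torch
from torch import nn

# Linear projection followed by a reshape into (heads, d_k) -- the "split the
# vector into heads" step described in the docstring. Shapes are illustrative.
d_model, heads, d_k = 512, 8, 64
linear = nn.Linear(d_model, heads * d_k, bias=True)

x = torch.randn(10, 32, d_model)        # [seq_len, batch_size, d_model]
head_shape = x.shape[:-1]               # keep all dimensions except the last
x = linear(x)                           # [seq_len, batch_size, heads * d_k]
x = x.view(*head_shape, heads, d_k)     # [seq_len, batch_size, heads, d_k]
print(x.shape)                          # torch.Size([10, 32, 8, 64])
```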
@@ -38,7 +38,7 @@ class PrepareForMultiHeadAttention(Module):
 
 
 class MultiHeadAttention(Module):
-    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool=True):
+    def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
         """
         ## Multi-Head Attention Module
...
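The changed line only adds spacing around the `bias` default. For context, a generic sketch of the scaled dot-product attention that a multi-head attention module computes; this is an illustration, not the repository's `forward` implementation:

```python
import math
import torch
from torch.nn import functional as F

# Scaled dot-product attention across heads, written generically.
seq_len, batch, heads, d_k = 10, 32, 8, 64
q = torch.randn(seq_len, batch, heads, d_k)
k = torch.randn(seq_len, batch, heads, d_k)
v = torch.randn(seq_len, batch, heads, d_k)

scores = torch.einsum('ibhd,jbhd->ijbh', q, k) / math.sqrt(d_k)  # query-key dot products
attn = F.softmax(scores, dim=1)                                  # normalize over key positions
out = torch.einsum('ijbh,jbhd->ibhd', attn, v)                   # weighted sum of values
print(out.shape)                                                 # torch.Size([10, 32, 8, 64])
```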
@@ -12,7 +12,7 @@ from .positional_encoding import get_positional_encoding
 class EmbeddingsWithPositionalEncoding(Module):
     """
-    ## Embed tokenas and add [fixed positional encoding](positional_encoding.html)
+    ## Embed tokens and add [fixed positional encoding](positional_encoding.html)
     """
 
     def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
         super().__init__()
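As a hedged sketch of "embed tokens and add a fixed positional encoding", assuming the usual pattern of scaling embeddings by sqrt(d_model) and adding a non-trainable encoding table (the names and the zero-filled table below are illustrative stand-ins):

```python
import math
import torch
from torch import nn

# Token embedding scaled by sqrt(d_model), plus a fixed (non-trainable) encoding.
# The zero table stands in for the sinusoidal values the real module would use.
d_model, n_vocab, max_len = 512, 10000, 5000
embed = nn.Embedding(n_vocab, d_model)
fixed_pe = torch.zeros(max_len, 1, d_model)

tokens = torch.randint(0, n_vocab, (10, 32))   # [seq_len, batch_size]
x = embed(tokens) * math.sqrt(d_model)         # scaling used in "Attention Is All You Need"
x = x + fixed_pe[:tokens.shape[0]]             # encoding for the first seq_len positions
print(x.shape)                                 # torch.Size([10, 32, 512])
```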
@@ -27,13 +27,13 @@ class EmbeddingsWithPositionalEncoding(Module):
 class EmbeddingsWithLearnedPositionalEncoding(Module):
     """
-    ## Embed tokenas and add parameterized positional encodings
+    ## Embed tokens and add parameterized positional encodings
     """
 
     def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
         super().__init__()
         self.linear = nn.Embedding(n_vocab, d_model)
         self.d_model = d_model
-        self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model))
+        self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)
 
     def __call__(self, x: torch.Tensor):
         pe = self.positional_encodings[:x.shape[0]]
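Note that `requires_grad=True` is already the default for `nn.Parameter`, so the change here only makes it explicit. A minimal sketch of the learned-encoding variant, assuming the same embedding scaling as the fixed version (everything in `__call__` beyond the slicing shown above is an assumption):

```python
import math
import torch
from torch import nn

# Learned positional encodings: the encoding table is an nn.Parameter and is
# trained along with the embeddings.
d_model, n_vocab, max_len = 512, 10000, 5000
embed = nn.Embedding(n_vocab, d_model)
learned_pe = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)

tokens = torch.randint(0, n_vocab, (10, 32))   # [seq_len, batch_size]
pe = learned_pe[:tokens.shape[0]]              # encodings for the first seq_len positions
x = embed(tokens) * math.sqrt(d_model) + pe    # assumed to mirror the fixed-encoding variant
print(x.shape)                                 # torch.Size([10, 32, 512])
```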
@@ -67,7 +67,7 @@ class TransformerLayer(Module):
     in where the layer-normalization is done.
     Here we do a layer normalization before attention and feed-forward networks,
     and add the original residual vectors.
-    Alternative is to do a layer normalzation after adding the residuals.
+    Alternative is to do a layer normalization after adding the residuals.
     But we found this to be less stable when training.
     We found a detailed discussion about this in paper
     [On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745).
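A small illustration of the two normalization placements discussed in the docstring above, using a linear layer as a stand-in for the attention or feed-forward sublayer (not the repository's code):

```python
import torch
from torch import nn

# Pre-norm vs. post-norm residual blocks, with a linear layer standing in for
# the attention / feed-forward sublayer.
d_model = 512
norm = nn.LayerNorm(d_model)
sublayer = nn.Linear(d_model, d_model)

x = torch.randn(10, 32, d_model)

# Pre-norm (what the docstring describes): normalize, apply sublayer, add residual.
pre_norm_out = x + sublayer(norm(x))

# Post-norm (the "alternative"): apply sublayer, add residual, then normalize.
post_norm_out = norm(x + sublayer(x))
```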
@@ -162,7 +162,7 @@ class Generator(Module):
     """
     ## Generator
 
-    This predicts the tokens and gives the lof softmaxes of those.
+    This predicts the tokens and gives the lof softmax of those.
     You don't need this if you are using `nn.CrossEntropyLoss`.
     """
     def __init__(self, n_vocab: int, d_model: int):
...
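A minimal sketch of what such a generator head does: project the decoder output to vocabulary size and take a log-softmax. As the docstring notes, `nn.CrossEntropyLoss` applies log-softmax internally, so with that loss the raw logits are enough. Shapes and names are illustrative:

```python
import torch
from torch import nn
from torch.nn import functional as F

# Project the decoder output to vocabulary size, then take a log-softmax.
n_vocab, d_model = 10000, 512
projection = nn.Linear(d_model, n_vocab)

x = torch.randn(10, 32, d_model)            # decoder output [seq_len, batch, d_model]
logits = projection(x)                      # [seq_len, batch, n_vocab]
log_probs = F.log_softmax(logits, dim=-1)   # per-token log-probabilities
# With nn.CrossEntropyLoss you would pass `logits` directly, since the loss
# applies log-softmax itself.
```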
@@ -9,8 +9,8 @@ PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
 PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
 \end{align}
 
-Where $1 \leq 2i, 2i + 1 \leq d_{model}$ are the feature indexes in the encoding,
-and $p$ is the position.
+Where $1 \leq 2i, 2i + 1 \leq d_{model}$
+are the feature indexes in the encoding, and $p$ is the position.
 """
 
 import math
...
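For reference, a standalone computation of the encoding defined by the formula above (the repository presumably builds a similar table in `get_positional_encoding`; the sizes here are illustrative):

```python
import math
import torch

# PE[p, 2i]     = sin(p / 10000^(2i / d_model))
# PE[p, 2i + 1] = cos(p / 10000^(2i / d_model))
d_model, max_len = 512, 5000
position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)   # p, shape [max_len, 1]
two_i = torch.arange(0, d_model, 2, dtype=torch.float32)                # 2i, shape [d_model / 2]
div_term = torch.exp(two_i * (-math.log(10000.0) / d_model))            # 10000^(-2i / d_model)

encodings = torch.zeros(max_len, d_model)
encodings[:, 0::2] = torch.sin(position * div_term)   # even feature indexes
encodings[:, 1::2] = torch.cos(position * div_term)   # odd feature indexes
print(encodings.shape)                                # torch.Size([5000, 512])
```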