Commit 1b0f8944 authored by Varuna Jayasiri

spelling

Parent 330df7e5
@@ -9,17 +9,17 @@ from typing import Optional
import torch
from labml import tracker
from labml_helpers.module import Module
from torch import nn as nn
from torch.nn import functional as F
from labml_helpers.module import Module
class PrepareForMultiHeadAttention(Module):
"""
This module does a linear transformation and splits the vector into given
number of heads for multi-head attention.
"""
def __init__(self, d_model: int, heads: int, d_k: int, bias: bool):
super().__init__()
self.linear = nn.Linear(d_model, heads * d_k, bias=bias)
@@ -38,7 +38,7 @@ class PrepareForMultiHeadAttention(Module):
class MultiHeadAttention(Module):
-def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool=True):
+def __init__(self, heads: int, d_model: int, dropout_prob: float = 0.1, bias: bool = True):
"""
## Multi-Head Attention Module
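For context, `PrepareForMultiHeadAttention` above projects the input with a single `nn.Linear` and then splits the result into `heads` vectors of size `d_k`. A minimal standalone sketch of that reshaping (tensor names and sizes are illustrative, not taken from this commit):

```python
import torch
from torch import nn

# Illustrative sizes (not from the commit)
seq_len, batch_size, d_model = 10, 4, 512
heads, d_k = 8, 64  # here heads * d_k == d_model, but that is not required

x = torch.randn(seq_len, batch_size, d_model)

# One linear projection to heads * d_k features, then split the last
# dimension into (heads, d_k) -- the step the docstring describes.
linear = nn.Linear(d_model, heads * d_k, bias=True)
head_shape = x.shape[:-1]              # (seq_len, batch_size)
x = linear(x)                          # (seq_len, batch_size, heads * d_k)
x = x.view(*head_shape, heads, d_k)    # (seq_len, batch_size, heads, d_k)

print(x.shape)  # torch.Size([10, 4, 8, 64])
```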
@@ -12,7 +12,7 @@ from .positional_encoding import get_positional_encoding
class EmbeddingsWithPositionalEncoding(Module):
"""
-## Embed tokenas and add [fixed positional encoding](positional_encoding.html)
+## Embed tokens and add [fixed positional encoding](positional_encoding.html)
"""
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
@@ -27,13 +27,13 @@ class EmbeddingsWithPositionalEncoding(Module):
class EmbeddingsWithLearnedPositionalEncoding(Module):
"""
-## Embed tokenas and add parameterized positional encodings
+## Embed tokens and add parameterized positional encodings
"""
def __init__(self, d_model: int, n_vocab: int, max_len: int = 5000):
super().__init__()
self.linear = nn.Embedding(n_vocab, d_model)
self.d_model = d_model
-self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model))
+self.positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model), requires_grad=True)
def __call__(self, x: torch.Tensor):
pe = self.positional_encodings[:x.shape[0]]
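Note that `requires_grad=True` is already the default for `nn.Parameter`, so the added argument only makes the intent explicit. A small standalone sketch of embedding tokens and adding learned positional encodings (sizes are illustrative; the √d_model scaling follows the original Transformer paper and is assumed here, not shown in this hunk):

```python
import torch
from torch import nn

# Illustrative sizes
max_len, d_model, n_vocab = 5000, 512, 10000

# requires_grad=True is the default for nn.Parameter; writing it out
# just documents that these encodings are learned during training.
positional_encodings = nn.Parameter(torch.zeros(max_len, 1, d_model),
                                    requires_grad=True)
embedding = nn.Embedding(n_vocab, d_model)

# Toy batch of token ids with shape (seq_len, batch_size)
x = torch.randint(0, n_vocab, (10, 4))
pe = positional_encodings[:x.shape[0]]        # (10, 1, d_model)
out = embedding(x) * (d_model ** 0.5) + pe    # broadcasts over the batch
print(out.shape)  # torch.Size([10, 4, 512])
```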
@@ -67,7 +67,7 @@ class TransformerLayer(Module):
in where the layer-normalization is done.
Here we do a layer normalization before attention and feed-forward networks,
and add the original residual vectors.
-Alternative is to do a layer normalzation after adding the residuals.
+Alternative is to do a layer normalization after adding the residuals.
But we found this to be less stable when training.
We found a detailed discussion about this in paper
[On Layer Normalization in the Transformer Architecture](https://arxiv.org/abs/2002.04745).
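The two orderings discussed in the docstring can be sketched with a generic sub-layer standing in for attention or the feed-forward network (illustrative code, not this repository's `TransformerLayer`):

```python
import torch
from torch import nn

d_model = 512
norm = nn.LayerNorm([d_model])
sublayer = nn.Linear(d_model, d_model)  # stand-in for attention or the FFN

x = torch.randn(10, 4, d_model)

# Pre-norm (what the docstring describes): normalize first,
# then add the residual of the original x.
pre_norm_out = x + sublayer(norm(x))

# Post-norm (the alternative described as less stable in training):
# add the residual first, then normalize.
post_norm_out = norm(x + sublayer(x))

print(pre_norm_out.shape, post_norm_out.shape)
```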
@@ -162,7 +162,7 @@ class Generator(Module):
"""
## Generator
-This predicts the tokens and gives the lof softmaxes of those.
+This predicts the tokens and gives the lof softmax of those.
You don't need this if you are using `nn.CrossEntropyLoss`.
"""
def __init__(self, n_vocab: int, d_model: int):
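As the docstring says, `nn.CrossEntropyLoss` applies `log_softmax` followed by negative log-likelihood internally, so it expects raw logits. A rough sketch of both paths (illustrative sizes, not the library's `Generator` class):

```python
import torch
from torch import nn
from torch.nn import functional as F

# Illustrative sizes
n_vocab, d_model = 10000, 512
projection = nn.Linear(d_model, n_vocab)

x = torch.randn(10, 4, d_model)            # stand-in for decoder output
logits = projection(x)                     # (seq_len, batch, n_vocab)
log_probs = F.log_softmax(logits, dim=-1)  # what a generator would return

# With nn.CrossEntropyLoss you feed the raw logits instead, because the
# loss applies log_softmax + negative log-likelihood itself.
targets = torch.randint(0, n_vocab, (10, 4))
loss = nn.CrossEntropyLoss()(logits.view(-1, n_vocab), targets.view(-1))
print(log_probs.shape, loss.item())
```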
@@ -9,8 +9,8 @@ PE_{p,2i} &= sin\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg) \\
PE_{p,2i + 1} &= cos\Bigg(\frac{p}{10000^{\frac{2i}{d_{model}}}}\Bigg)
\end{align}
-Where $1 \leq 2i, 2i + 1 \leq d_{model}$ are the feature indexes in the encoding,
-and $p$ is the position.
+Where $1 \leq 2i, 2i + 1 \leq d_{model}$
+are the feature indexes in the encoding, and $p$ is the position.
"""
import math
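A standalone sketch that computes the encodings from the two formulas above (the function name and default sizes are made up for illustration; the repository's own implementation is `get_positional_encoding`):

```python
import math

import torch


def sinusoidal_encodings(d_model: int = 512, max_len: int = 5000) -> torch.Tensor:
    # encodings[p, 2i]     = sin(p / 10000^(2i / d_model))
    # encodings[p, 2i + 1] = cos(p / 10000^(2i / d_model))
    encodings = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
    two_i = torch.arange(0, d_model, 2, dtype=torch.float32)
    div_term = torch.exp(-math.log(10000.0) * two_i / d_model)
    encodings[:, 0::2] = torch.sin(position * div_term)
    encodings[:, 1::2] = torch.cos(position * div_term)
    # Shape (max_len, 1, d_model) so it broadcasts over the batch dimension
    return encodings.unsqueeze(1)


pe = sinusoidal_encodings()
print(pe.shape)  # torch.Size([5000, 1, 512])
```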