From 969df7190d5c584f04e3ae69aaf15cf9d2a29b8b Mon Sep 17 00:00:00 2001
From: Varuna Jayasiri
Date: Fri, 19 Feb 2021 14:53:44 +0530
Subject: [PATCH] documentation fixes

---
 docs/transformers/compressive/experiment.html |   8 +-
 docs/transformers/compressive/index.html      | 237 +++++++++---------
 labml_nn/transformers/compressive/__init__.py |  31 +--
 .../transformers/compressive/experiment.py    |   8 +-
 4 files changed, 143 insertions(+), 141 deletions(-)

diff --git a/docs/transformers/compressive/experiment.html b/docs/transformers/compressive/experiment.html
index 201f6a53..94b8fbe8 100644
--- a/docs/transformers/compressive/experiment.html
+++ b/docs/transformers/compressive/experiment.html
@@ -204,7 +204,7 @@

-Length of the memory (for masks)
+Total length of the memory and compressed memory (for masks)

58        m_len = len(mem[0]) if mem else 0
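For context on why the comment now says "total" length: the attention mask has to cover the compressed memory as well as the regular memory. A minimal sketch of the length computation and of a causal mask that exposes every memory slot (the shapes and helper names are illustrative assumptions, not the repository's exact API):

import torch

def total_mem_len(mem, c_mem):
    # Memories are per-layer lists of [len, batch, d_model] tensors,
    # so the first layer's length gives the shared memory length.
    m_len = len(mem[0]) if mem else 0
    if c_mem:
        m_len += len(c_mem[0])
    return m_len

def causal_mask_with_memory(seq_len: int, m_len: int) -> torch.Tensor:
    # Row i (a new token) may attend to all m_len memory slots plus new tokens 0..i.
    return torch.tril(torch.ones(seq_len, m_len + seq_len, dtype=torch.bool), diagonal=m_len)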
@@ -311,7 +311,7 @@
 Configurations
-The default configs can and will be over-ridden when we start the experiment.
+The default configurations can and will be overridden when we start the experiment.

87class Configs(NLPAutoRegressionConfigs):
@@ -512,7 +512,7 @@
-If it's configured not to use memory
+If the configurations specify not to use memory

136        if self.mem_len == 0 and self.c_mem_len == 0:
@@ -726,7 +726,7 @@ and $N_m$ is the maximum number of memories we maintain (mem_len).
 Return memories and the memories that were compressed.
-Memories that were compressed is needed for the reconstruction loss computation.
+Memories that were compressed are needed for the reconstruction loss computation.

195        return CompressedMemory(mem, c_mem), mem_to_compress
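The surrounding merge_memory code (elided from this hunk) is what produces these two values: it concatenates the new memories and hands the oldest overflow to the compression function. A heavily simplified, hypothetical sketch of that bookkeeping for a single layer, only to illustrate the policy the comments describe (the real method operates on per-layer lists and differs in details):

def update_memory(mem: list, new_mem: list, mem_len: int, rate: int):
    # Append the newest memories, keep at most `mem_len` of them, and return the
    # overflow (rounded down to a multiple of the compression rate) for f_c to compress.
    mem = mem + new_mem
    to_compress = []
    if len(mem) > mem_len:
        n = (len(mem) - mem_len) // rate * rate
        to_compress, mem = mem[:n], mem[n:]
    return mem, to_compress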
diff --git a/docs/transformers/compressive/index.html b/docs/transformers/compressive/index.html
index c3046047..3992f882 100644
--- a/docs/transformers/compressive/index.html
+++ b/docs/transformers/compressive/index.html
@@ -84,40 +84,41 @@ $n_{cm}$ memories, where $c$ is the compression rate.
 The compression operation is defined as
 $f_c: \mathbb{R}^{nc \times d} \rightarrow \mathbb{R}^{n \times d}$.
 The paper introduces multiple choices for $f_c$ and we have only implemented
-1D convolution which seems to give best results.
+1D convolution which seems to give the best results.
 Each layer has a separate compression operation $f_c^{(i)}$ where
 $i$ is the layer number.

 Training compression operation
 Since training compression with BPTT requires maintaining
-a very large computational graph (many time steps), paper proposes
+a very large computational graph (many time steps), the paper proposes
 an auto-encoding loss and an attention reconstruction loss.
-The auto-encoding loss, decodes the original memories from the compressed memories,
-and calculate the loss.
+The auto-encoding loss decodes the original memories from the compressed memories
+and calculates the loss.
 Attention reconstruction loss computes the multi-headed attention results
-on the compressed memory and on uncompressed memory and get a mean squared error
+on the compressed memory and on uncompressed memory and gets a mean squared error
 between them. We have implemented the latter here since it gives better results.
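The auto-encoding alternative is not implemented in this repository; only the attention reconstruction loss below is. Purely to illustrate the idea described above, a hedged sketch (the class name and the transposed-convolution decoder are assumptions, not code from the paper or this repository):

import torch
from torch import nn

class AutoEncodingLoss(nn.Module):
    """Decode the original memories from the compressed memories and penalize the error."""
    def __init__(self, d_model: int, compression_rate: int):
        super().__init__()
        # A transposed convolution mirrors a Conv1d compression with kernel size = stride = rate
        self.deconv = nn.ConvTranspose1d(d_model, d_model,
                                         kernel_size=compression_rate, stride=compression_rate)
        self.loss_func = nn.MSELoss()

    def forward(self, mem: torch.Tensor, c_mem: torch.Tensor):
        # mem: [mem_len, batch, d_model], c_mem: [mem_len // rate, batch, d_model]
        reconstructed = self.deconv(c_mem.permute(1, 2, 0)).permute(2, 0, 1)
        return self.loss_func(reconstructed, mem)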

-This implementation uses pre-layer norm while the paper uses post-layer norm.
+This implementation uses pre-layer normalization
+while the paper uses post-layer normalization.
 Pre-layer norm does the layer norm before FFN[../feedforward.html) and
-self attention, and the pass through in the residual connection is not normalized.
+self-attention, and the pass-through in the residual connection is not normalized.
 This is supposed to be more stable in standard transformer setups.
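In code terms the difference is only where the normalization sits relative to the residual connection; a schematic sketch, not the classes defined below:

import torch
from torch import nn

def pre_ln_block(x: torch.Tensor, sublayer: nn.Module, norm: nn.LayerNorm) -> torch.Tensor:
    # Pre-layer norm (this implementation): normalize the sub-layer input;
    # the pass-through residual path stays un-normalized.
    return x + sublayer(norm(x))

def post_ln_block(x: torch.Tensor, sublayer: nn.Module, norm: nn.LayerNorm) -> torch.Tensor:
    # Post-layer norm (the paper): normalize after the residual addition.
    return norm(x + sublayer(x))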

-Here's the training code and a notebook for training a compressive transformer
-model on Tiny Shakespeare dataset.
+Here are the training code and a notebook for training a compressive transformer
+model on the Tiny Shakespeare dataset.
 Open In Colab  View Run

-53from typing import Optional, List
-54
-55import torch
-56import torch.nn.functional as F
-57from torch import nn
-58
-59from labml_helpers.module import Module, TypedModuleList
-60from labml_nn.transformers.feed_forward import FeedForward
-61from labml_nn.transformers.mha import PrepareForMultiHeadAttention
-62from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention
-63from labml_nn.utils import clone_module_list
+54from typing import Optional, List
+55
+56import torch
+57import torch.nn.functional as F
+58from torch import nn
+59
+60from labml_helpers.module import Module, TypedModuleList
+61from labml_nn.transformers.feed_forward import FeedForward
+62from labml_nn.transformers.mha import PrepareForMultiHeadAttention
+63from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention
+64from labml_nn.utils import clone_module_list
@@ -131,7 +132,7 @@ model on Tiny Shakespeare dataset.

with some tensor dimension permutations.

-
66class Conv1dCompression(Module):
+
67class Conv1dCompression(Module):
@@ -145,7 +146,7 @@ with some tensor dimension permutations.

-
74    def __init__(self, compression_rate: int, d_model: int):
+
75    def __init__(self, compression_rate: int, d_model: int):
@@ -156,8 +157,8 @@ with some tensor dimension permutations.

-
79        super().__init__()
-80        self.conv = nn.Conv1d(d_model, d_model, kernel_size=compression_rate, stride=compression_rate)
+
80        super().__init__()
+81        self.conv = nn.Conv1d(d_model, d_model, kernel_size=compression_rate, stride=compression_rate)
@@ -168,7 +169,7 @@ with some tensor dimension permutations.

mem has shape [seq_len, batch, d_model]

-
82    def forward(self, mem: torch.Tensor):
+
83    def forward(self, mem: torch.Tensor):
@@ -180,7 +181,7 @@ with some tensor dimension permutations.

The convolution layer accepts in the form [batch, features, sequence]

-
89        mem = mem.permute(1, 2, 0)
+
90        mem = mem.permute(1, 2, 0)
@@ -191,7 +192,7 @@ The convolution layer accepts in the form [batch, features, sequence]Get compressed memory by running it through the convolution layer

-
91        c_mem = self.conv(mem)
+
92        c_mem = self.conv(mem)
@@ -202,7 +203,7 @@ The convolution layer accepts in the form [batch, features, sequence]Permute back to form [seq_len, batch, d_model]

-
93        return c_mem.permute(2, 0, 1)
+
94        return c_mem.permute(2, 0, 1)
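A quick shape check of the compression above (the concrete sizes are illustrative only):

import torch

compress = Conv1dCompression(compression_rate=4, d_model=16)
mem = torch.randn(8, 2, 16)   # [mem_len = 8, batch = 2, d_model = 16]
c_mem = compress(mem)         # eight memories compressed down to 8 / 4 = 2
assert c_mem.shape == (2, 2, 16)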
@@ -214,7 +215,7 @@ The convolution layer accepts in the form [batch, features, sequence]This is the implementation of a single compressive transformer layer

-
96class CompressiveTransformerLayer(Module):
+
97class CompressiveTransformerLayer(Module):
@@ -231,12 +232,12 @@ The convolution layer accepts in the form [batch, features, sequence]
-
102    def __init__(self, *,
-103                 d_model: int,
-104                 self_attn: RelativeMultiHeadAttention,
-105                 feed_forward: FeedForward,
-106                 dropout_prob: float,
-107                 compress: Conv1dCompression):
+
103    def __init__(self, *,
+104                 d_model: int,
+105                 self_attn: RelativeMultiHeadAttention,
+106                 feed_forward: FeedForward,
+107                 dropout_prob: float,
+108                 compress: Conv1dCompression):
@@ -247,14 +248,14 @@ The convolution layer accepts in the form [batch, features, sequence]
-
115        super().__init__()
-116        self.compress = compress
-117        self.size = d_model
-118        self.self_attn = self_attn
-119        self.feed_forward = feed_forward
-120        self.dropout = nn.Dropout(dropout_prob)
-121        self.norm_self_attn = nn.LayerNorm([d_model])
-122        self.norm_ff = nn.LayerNorm([d_model])
+
116        super().__init__()
+117        self.compress = compress
+118        self.size = d_model
+119        self.self_attn = self_attn
+120        self.feed_forward = feed_forward
+121        self.dropout = nn.Dropout(dropout_prob)
+122        self.norm_self_attn = nn.LayerNorm([d_model])
+123        self.norm_ff = nn.LayerNorm([d_model])
@@ -269,7 +270,7 @@ The convolution layer accepts in the form [batch, features, sequence]
-
124    def concat_memory(self, z: torch.Tensor, mem: Optional[torch.Tensor], c_mem: Optional[torch.Tensor]):
+
125    def concat_memory(self, z: torch.Tensor, mem: Optional[torch.Tensor], c_mem: Optional[torch.Tensor]):
@@ -280,8 +281,8 @@ The convolution layer accepts in the form [batch, features, sequence]If there is no memory just return the token embeddings

-
133        if mem is None:
-134            return z
+
134        if mem is None:
+135            return z
@@ -292,8 +293,8 @@ The convolution layer accepts in the form [batch, features, sequence]If there are compressed memory concatenate that with memory

-
137        if c_mem is not None:
-138            mem = torch.cat((c_mem, mem), dim=0)
+
138        if c_mem is not None:
+139            mem = torch.cat((c_mem, mem), dim=0)
@@ -304,7 +305,7 @@ The convolution layer accepts in the form [batch, features, sequence]Run the memory through the normalization layer

-
141        mem = self.norm_self_attn(mem)
+
142        mem = self.norm_self_attn(mem)
@@ -315,7 +316,7 @@ The convolution layer accepts in the form [batch, features, sequence]Concatenate normalized memory and normalized token embeddings

-
143        return torch.cat((mem, z), dim=0)
+
144        return torch.cat((mem, z), dim=0)
@@ -332,11 +333,11 @@ The convolution layer accepts in the form [batch, features, sequence]
-
145    def forward(self, *,
-146                x: torch.Tensor,
-147                mem: Optional[torch.Tensor],
-148                c_mem: Optional[torch.Tensor],
-149                mask: torch.Tensor):
+
146    def forward(self, *,
+147                x: torch.Tensor,
+148                mem: Optional[torch.Tensor],
+149                c_mem: Optional[torch.Tensor],
+150                mask: torch.Tensor):
@@ -347,7 +348,7 @@ The convolution layer accepts in the form [batch, features, sequence]Normalize the vectors before doing self attention

-
159        z = self.norm_self_attn(x)
+
160        z = self.norm_self_attn(x)
@@ -358,7 +359,7 @@ The convolution layer accepts in the form [batch, features, sequence]Normalize and concatenate memory and compressed memory

-
161        m_z = self.concat_memory(z, mem, c_mem)
+
162        m_z = self.concat_memory(z, mem, c_mem)
@@ -369,7 +370,7 @@ The convolution layer accepts in the form [batch, features, sequence]Attention

-
163        self_attn = self.self_attn(query=z, key=m_z, value=m_z, mask=mask)
+
164        self_attn = self.self_attn(query=z, key=m_z, value=m_z, mask=mask)
@@ -380,7 +381,7 @@ The convolution layer accepts in the form [batch, features, sequence]Add the attention results

-
165        x = x + self.dropout(self_attn)
+
166        x = x + self.dropout(self_attn)
@@ -391,7 +392,7 @@ The convolution layer accepts in the form [batch, features, sequence]Normalize for feed-forward

-
168        z = self.norm_ff(x)
+
169        z = self.norm_ff(x)
@@ -402,7 +403,7 @@ The convolution layer accepts in the form [batch, features, sequence]Pass through the feed-forward network

-
170        ff = self.feed_forward(z)
+
171        ff = self.feed_forward(z)
@@ -413,7 +414,7 @@ The convolution layer accepts in the form [batch, features, sequence]Add the feed-forward results back

-
172        x = x + self.dropout(ff)
+
173        x = x + self.dropout(ff)
@@ -424,7 +425,7 @@ The convolution layer accepts in the form [batch, features, sequence]
-
175        return x
+
176        return x
@@ -436,7 +437,7 @@ The convolution layer accepts in the form [batch, features, sequence]This consists of multiple compressive transformer layers

-
178class CompressiveTransformer(Module):
+
179class CompressiveTransformer(Module):
@@ -447,8 +448,8 @@ The convolution layer accepts in the form [batch, features, sequence]
-
185    def __init__(self, layer: CompressiveTransformerLayer, n_layers: int):
-186        super().__init__()
+
186    def __init__(self, layer: CompressiveTransformerLayer, n_layers: int):
+187        super().__init__()
@@ -459,7 +460,7 @@ The convolution layer accepts in the form [batch, features, sequence]Make copies of the transformer layer

-
188        self.layers = clone_module_list(layer, n_layers)
+
189        self.layers = clone_module_list(layer, n_layers)
@@ -470,7 +471,7 @@ The convolution layer accepts in the form [batch, features, sequence]Final normalization layer

-
190        self.norm = nn.LayerNorm([layer.size])
+
191        self.norm = nn.LayerNorm([layer.size])
@@ -488,7 +489,7 @@ The convolution layer accepts in the form [batch, features, sequence]
-
192    def forward(self, x: torch.Tensor, mem: List[torch.Tensor], c_mem: List[torch.Tensor], mask: torch.Tensor):
+
193    def forward(self, x: torch.Tensor, mem: List[torch.Tensor], c_mem: List[torch.Tensor], mask: torch.Tensor):
@@ -500,7 +501,7 @@ The convolution layer accepts in the form [batch, features, sequence]
-
203        new_mem = []
+
204        new_mem = []
@@ -511,7 +512,7 @@ which will become the memories for the next sequential batch.

Run through each transformer layer

-
205        for i, layer in enumerate(self.layers):
+
206        for i, layer in enumerate(self.layers):
@@ -522,7 +523,7 @@ which will become the memories for the next sequential batch.

Add to the list of feature vectors

-
207            new_mem.append(x.detach())
+
208            new_mem.append(x.detach())
@@ -533,7 +534,7 @@ which will become the memories for the next sequential batch.

Memory

-
209            m = mem[i] if mem else None
+
210            m = mem[i] if mem else None
@@ -544,7 +545,7 @@ which will become the memories for the next sequential batch.

Compressed Memory

-
211            cm = c_mem[i] if c_mem else None
+
212            cm = c_mem[i] if c_mem else None
@@ -555,7 +556,7 @@ which will become the memories for the next sequential batch.

Run through the transformer XL layer

-
213            x = layer(x=x, mem=m, c_mem=cm, mask=mask)
+
214            x = layer(x=x, mem=m, c_mem=cm, mask=mask)
@@ -566,7 +567,7 @@ which will become the memories for the next sequential batch.

Finally, normalize the vectors

-
215        return self.norm(x), new_mem
+
216        return self.norm(x), new_mem
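Putting the pieces together, a hedged usage sketch. The attention and feed-forward constructor arguments and the mask convention are assumptions about the surrounding labml_nn modules, not something this patch specifies:

import torch
from labml_nn.transformers.feed_forward import FeedForward
from labml_nn.transformers.xl.relative_mha import RelativeMultiHeadAttention

d_model, heads, d_ff, n_layers, rate = 128, 4, 256, 3, 4
layer = CompressiveTransformerLayer(d_model=d_model,
                                    self_attn=RelativeMultiHeadAttention(heads, d_model, 0.1),
                                    feed_forward=FeedForward(d_model, d_ff, 0.1),
                                    dropout_prob=0.1,
                                    compress=Conv1dCompression(rate, d_model))
model = CompressiveTransformer(layer, n_layers)

seq_len, batch = 16, 2
x = torch.randn(seq_len, batch, d_model)
# Causal mask of shape [query, key, 1]; with no memory yet, the key length equals seq_len
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool)).unsqueeze(-1)
out, new_mem = model(x, mem=[], c_mem=[], mask=mask)   # out: [16, 2, 128], len(new_mem) == 3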
@@ -575,20 +576,20 @@ which will become the memories for the next sequential batch.
 Attention Reconstruction Loss
-Attention reconstruction loss recreates the self attention output with
-uncompressed memory and with compressed memory and calculate mean squared error
+Attention reconstruction loss recreates the self-attention output with
+uncompressed memory and with compressed memory and calculates the mean squared error
 between the two. It does this without positional encoding.
 When calculating and training the compression function $f_c$ with attention
-reconstruction loss all parameters but $f_c$ are frozen.
-This includes key value projections and bias/scaling after normalization.
+reconstruction loss, all parameters but $f_c$ are frozen.
+This includes key/value projections and bias/scaling after normalization.
 Since this loss can be computed independently of the cross-entropy-loss of
 the model you can have a separate optimizer that only updates $f_c$.
 However, we use the same optimizer to update $f_c$ so when calculating
-attention reconstruction loss we detach all other parameters except $f_c$
+attention reconstruction loss, we detach all other parameters except $f_c$
 from the gradient computation.
-218class AttentionReconstructionLoss:
+219class AttentionReconstructionLoss:
@@ -599,7 +600,7 @@ from the gradient computation.

layers is the list of Compressive Transformer layers

-
236    def __init__(self, layers: TypedModuleList[CompressiveTransformerLayer]):
+
237    def __init__(self, layers: TypedModuleList[CompressiveTransformerLayer]):
@@ -610,8 +611,8 @@ from the gradient computation.

-
240        self.layers = layers
-241        self.loss_func = nn.MSELoss()
+
241        self.layers = layers
+242        self.loss_func = nn.MSELoss()
@@ -627,7 +628,7 @@ where the projections are done with the parameters detached from gradient comput
-
243    def prepare_for_attn(self, pmha: PrepareForMultiHeadAttention, x: torch.Tensor):
+
244    def prepare_for_attn(self, pmha: PrepareForMultiHeadAttention, x: torch.Tensor):
@@ -638,7 +639,7 @@ where the projections are done with the parameters detached from gradient comput

Shape of the input except embedding dimension; [seq_len, batch_size].

-
253        head_shape = x.shape[:-1]
+
254        head_shape = x.shape[:-1]
@@ -649,8 +650,8 @@ where the projections are done with the parameters detached from gradient comput

Detach projection weights and bias

-
256        weight = pmha.linear.weight.detach()
-257        bias = pmha.linear.bias.detach() if pmha.linear.bias is not None else None
+
257        weight = pmha.linear.weight.detach()
+258        bias = pmha.linear.bias.detach() if pmha.linear.bias is not None else None
@@ -661,7 +662,7 @@ where the projections are done with the parameters detached from gradient comput

Linear transform

-
259        x = F.linear(x, weight, bias)
+
260        x = F.linear(x, weight, bias)
@@ -672,7 +673,7 @@ where the projections are done with the parameters detached from gradient comput

Split last dimension into heads

-
262        x = x.view(*head_shape, pmha.heads, pmha.d_k)
+
263        x = x.view(*head_shape, pmha.heads, pmha.d_k)
@@ -683,7 +684,7 @@ where the projections are done with the parameters detached from gradient comput

Output has shape [seq_len, batch_size, heads, d_k] or [batch_size, d_model]

-
265        return x
+
266        return x
@@ -696,7 +697,7 @@ where the projections are done with the parameters detached from gradient comput to detach projection parameters.

-
267    def attn(self, layer: RelativeMultiHeadAttention, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
+
268    def attn(self, layer: RelativeMultiHeadAttention, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor):
@@ -707,9 +708,9 @@ to detach projection parameters.

Calculate query, key and value projections

-
274        query = self.prepare_for_attn(layer.query, query)
-275        key = self.prepare_for_attn(layer.key, key)
-276        value = self.prepare_for_attn(layer.value, value)
+
275        query = self.prepare_for_attn(layer.query, query)
+276        key = self.prepare_for_attn(layer.key, key)
+277        value = self.prepare_for_attn(layer.value, value)
@@ -721,7 +722,7 @@ to detach projection parameters.

This gives a tensor of shape [seq_len, seq_len, batch_size, heads].

-
280        scores = torch.einsum('ibhd,jbhd->ijbh', query, key)
+
281        scores = torch.einsum('ibhd,jbhd->ijbh', query, key)
@@ -732,7 +733,7 @@ This gives a tensor of shape [seq_len, seq_len, batch_size, heads].

Scale scores $\frac{Q K^\top}{\sqrt{d_k}}$

-
283        scores *= layer.scale
+
284        scores *= layer.scale
@@ -744,7 +745,7 @@ This gives a tensor of shape [seq_len, seq_len, batch_size, heads]. $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

-
287        attn = layer.softmax(scores)
+
288        attn = layer.softmax(scores)
@@ -757,7 +758,7 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

-
291        return torch.einsum("ijbh,jbhd->ibhd", attn, value)
+
292        return torch.einsum("ijbh,jbhd->ibhd", attn, value)
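A quick numeric check of the two einsum contractions used above (the sizes are illustrative):

import torch

q = torch.randn(8, 2, 4, 32)      # [seq_len_q, batch, heads, d_k]
k = v = torch.randn(6, 2, 4, 32)  # [seq_len_k, batch, heads, d_k]
scores = torch.einsum('ibhd,jbhd->ijbh', q, k)   # [8, 6, 2, 4]
attn = scores.softmax(dim=1)                     # softmax over the key positions
out = torch.einsum('ijbh,jbhd->ibhd', attn, v)   # back to [8, 2, 4, 32]
assert out.shape == (8, 2, 4, 32)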
@@ -768,7 +769,7 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

Perform layer normalization with shift and scale parameters detached.

-
293    def norm(self, ln: nn.LayerNorm, x: torch.Tensor):
+
294    def norm(self, ln: nn.LayerNorm, x: torch.Tensor):
@@ -779,8 +780,8 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

Detach shift(bias) and scaling(weight) parameters

-
299        weight = ln.weight.detach() if ln.weight is not None else None
-300        bias = ln.bias.detach() if ln.bias is not None else None
+
300        weight = ln.weight.detach() if ln.weight is not None else None
+301        bias = ln.bias.detach() if ln.bias is not None else None
@@ -791,7 +792,7 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

Layer normalization

-
303        return F.layer_norm(x, ln.normalized_shape, weight, bias, ln.eps)
+
304        return F.layer_norm(x, ln.normalized_shape, weight, bias, ln.eps)
@@ -802,7 +803,7 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

This calculates the loss for a layer

-
305    def calc_loss(self, layer: CompressiveTransformerLayer, h: torch.Tensor, mem: torch.Tensor):
+
306    def calc_loss(self, layer: CompressiveTransformerLayer, h: torch.Tensor, mem: torch.Tensor):
@@ -813,8 +814,8 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

Detach the token embeddings and memory.

-
311        h = h.detach()
-312        mem = mem.detach()
+
312        h = h.detach()
+313        mem = mem.detach()
@@ -826,7 +827,7 @@ $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_k}}\Bigg)$

The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient computation.

-
316        c_mem = layer.compress(mem)
+
317        c_mem = layer.compress(mem)
@@ -837,9 +838,9 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient

Normalize the embeddings and memories

-
319        h = self.norm(layer.norm_self_attn, h)
-320        mem = self.norm(layer.norm_self_attn, mem)
-321        c_mem = self.norm(layer.norm_self_attn, c_mem)
+
320        h = self.norm(layer.norm_self_attn, h)
+321        mem = self.norm(layer.norm_self_attn, mem)
+322        c_mem = self.norm(layer.norm_self_attn, c_mem)
@@ -847,10 +848,10 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient
-Calculate attention with uncompressed memory
+Calculate the attention with uncompressed memory
-324        attn_mem = self.attn(layer.self_attn, h, mem, mem)
+325        attn_mem = self.attn(layer.self_attn, h, mem, mem)
@@ -861,7 +862,7 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient

Calculate the attention with compressed memory

-
326        attn_cmem = self.attn(layer.self_attn, h, c_mem, c_mem)
+
327        attn_cmem = self.attn(layer.self_attn, h, c_mem, c_mem)
@@ -872,7 +873,7 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient

Calculate the mean square error

-
329        return self.loss_func(attn_cmem, attn_mem)
+
330        return self.loss_func(attn_cmem, attn_mem)
@@ -883,7 +884,7 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient
-
331    def __call__(self, h: List[torch.Tensor], mem: List[torch.Tensor]):
+
332    def __call__(self, h: List[torch.Tensor], mem: List[torch.Tensor]):
@@ -894,7 +895,7 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient

Calculate the losses for each layer

-
333        losses = [self.calc_loss(layer, h[n], mem[n]) for n, layer in enumerate(self.layers)]
+
334        losses = [self.calc_loss(layer, h[n], mem[n]) for n, layer in enumerate(self.layers)]
@@ -905,7 +906,7 @@ The parameters of $f_c^{(i)}$ are the only parameters not detached from gradient

Sum of the losses

-
335        return sum(losses)
+
336        return sum(losses)
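Continuing the construction sketch above, a hedged example of calling this loss; how the experiment weighs it against the cross-entropy loss is not shown here:

ar_loss = AttentionReconstructionLoss(model.layers)

# Per-layer inputs that were saved as new memories, and the old memories about to be compressed
h = [torch.randn(8, 2, d_model) for _ in range(n_layers)]
old_mem = [torch.randn(8, 2, d_model) for _ in range(n_layers)]

loss = ar_loss(h, old_mem)
loss.backward()   # gradients reach only the Conv1dCompression parameters, i.e. f_c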
diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py
index 89b7b97e..eb72bc74 100644
--- a/labml_nn/transformers/compressive/__init__.py
+++ b/labml_nn/transformers/compressive/__init__.py
@@ -22,29 +22,30 @@ $n_{cm}$ memories, where $c$ is the compression rate.
 The compression operation is defined as
 $f_c: \mathbb{R}^{nc \times d} \rightarrow \mathbb{R}^{n \times d}$.
 The paper introduces multiple choices for $f_c$ and we have only implemented
-1D convolution which seems to give best results.
+1D convolution which seems to give the best results.
 Each layer has a separate compression operation $f_c^{(i)}$ where
 $i$ is the layer number.

 ## Training compression operation

 Since training compression with BPTT requires maintaining
-a very large computational graph (many time steps), paper proposes
+a very large computational graph (many time steps), the paper proposes
 an *auto-encoding loss* and an *attention reconstruction loss*.
-The auto-encoding loss, decodes the original memories from the compressed memories,
-and calculate the loss.
+The auto-encoding loss decodes the original memories from the compressed memories
+and calculates the loss.
 Attention reconstruction loss computes the multi-headed attention results
-on the compressed memory and on uncompressed memory and get a mean squared error
+on the compressed memory and on uncompressed memory and gets a mean squared error
 between them. We have implemented the latter here since it gives better results.

-This implementation uses pre-layer norm while the paper uses post-layer norm.
+This implementation uses pre-layer normalization
+while the paper uses post-layer normalization.
 Pre-layer norm does the layer norm before FFN[../feedforward.html) and
-self attention, and the pass through in the residual connection is not normalized.
+self-attention, and the pass-through in the residual connection is not normalized.
 This is supposed to be more stable in standard transformer setups.

-Here's [the training code](experiment.html) and a notebook for training a compressive transformer
-model on Tiny Shakespeare dataset.
+Here are [the training code](experiment.html) and a notebook for training a compressive transformer
+model on the Tiny Shakespeare dataset.

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/transformers/compressive/experiment.ipynb)
 [![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=0d9b5338726c11ebb7c80242ac1c0002)
@@ -219,18 +220,18 @@ class AttentionReconstructionLoss:
     """
     ## Attention Reconstruction Loss

-    Attention reconstruction loss recreates the self attention output with
-    uncompressed memory and with compressed memory and calculate mean squared error
+    Attention reconstruction loss recreates the self-attention output with
+    uncompressed memory and with compressed memory and calculates the mean squared error
     between the two. It does this without positional encoding.

     When calculating and training the compression function $f_c$ with attention
-    reconstruction loss all parameters but $f_c$ are frozen.
-    This includes key value projections and bias/scaling after normalization.
+    reconstruction loss, all parameters but $f_c$ are frozen.
+    This includes key/value projections and bias/scaling after normalization.
     Since this loss can be computed independently of the cross-entropy-loss of
     the model you can have a separate optimizer that only updates $f_c$.
     However, we use the same optimizer to update $f_c$ so when calculating
-    attention reconstruction loss we detach all other parameters except $f_c$
+    attention reconstruction loss, we detach all other parameters except $f_c$
     from the gradient computation.
     """

     def __init__(self, layers: TypedModuleList[CompressiveTransformerLayer]):
@@ -320,7 +321,7 @@ class AttentionReconstructionLoss:
         mem = self.norm(layer.norm_self_attn, mem)
         c_mem = self.norm(layer.norm_self_attn, c_mem)

-        # Calculate attention with uncompressed memory
+        # Calculate the attention with uncompressed memory
         attn_mem = self.attn(layer.self_attn, h, mem, mem)
         # Calculate the attention with compressed memory
         attn_cmem = self.attn(layer.self_attn, h, c_mem, c_mem)
diff --git a/labml_nn/transformers/compressive/experiment.py b/labml_nn/transformers/compressive/experiment.py
index 71d8ec64..7915b547 100644
--- a/labml_nn/transformers/compressive/experiment.py
+++ b/labml_nn/transformers/compressive/experiment.py
@@ -54,7 +54,7 @@ class AutoregressiveModel(Module):
             mem = []
             c_mem = []

-        # Length of the memory (for masks)
+        # Total length of the memory and compressed memory (for masks)
         m_len = len(mem[0]) if mem else 0
         if c_mem:
             m_len += len(c_mem[0])
@@ -88,7 +88,7 @@ class Configs(NLPAutoRegressionConfigs):
     """
     ## Configurations

-    The default configs can and will be over-ridden when we start the experiment.
+    The default configurations can and will be overridden when we start the experiment.
     """

     model: AutoregressiveModel
@@ -132,7 +132,7 @@ class Configs(NLPAutoRegressionConfigs):
         Concatenate new memories and compress the oldest memories.
         """

-        # If it's configured not to use memory
+        # If the configurations specify not to use memory
         if self.mem_len == 0 and self.c_mem_len == 0:
             return CompressedMemory([], []), []
@@ -191,7 +191,7 @@ class Configs(NLPAutoRegressionConfigs):
             mem_to_compress = []

         # Return memories and the memories that were compressed.
-        # Memories that were compressed is needed for the reconstruction loss computation.
+        # Memories that were compressed are needed for the reconstruction loss computation.
         return CompressedMemory(mem, c_mem), mem_to_compress

     def step(self, batch: any, batch_idx: BatchIndex):
-- 
GitLab