Commit 1eb9fe4d authored by Varuna Jayasiri

batch norm formulas

Parent cee8a37a
This diff is collapsed.
......@@ -99,7 +99,7 @@
<url>
<loc>https://nn.labml.ai/normalization/batch_norm/index.html</loc>
<lastmod>2021-02-01T16:30:00+00:00</lastmod>
<lastmod>2021-02-02T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
......
......@@ -2,7 +2,7 @@
---
title: Batch Normalization
summary: >
A PyTorch implementations/tutorials of batch normalization.
A PyTorch implementation/tutorial of batch normalization.
---
# Batch Normalization
......@@ -100,8 +100,28 @@ from torch import nn
class BatchNorm(nn.Module):
"""
r"""
## Batch Normalization Layer
Batch normalization layer $\text{BN}$ normalizes the input $X$ as follows:
When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B, H, W}{\mathbb{E}}[X]}{\sqrt{\underset{B, H, W}{Var}[X] + \epsilon}}
+ \beta$$
When input $X \in \mathbb{R}^{B \times C}$ is a batch of vector embeddings,
where $B$ is the batch size and $C$ is the number of features.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B}{\mathbb{E}}[X]}{\sqrt{\underset{B}{Var}[X] + \epsilon}}
+ \beta$$
When input $X \in \mathbb{R}^{B \times C \times L}$ is a batch of sequence embeddings,
where $B$ is the batch size, $C$ is the number of features, and $L$ is the length of the sequence.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B, L}{\mathbb{E}}[X]}{\sqrt{\underset{B, L}{Var}[X] + \epsilon}}
+ \beta$$
"""
def __init__(self, channels: int, *,
......
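As a quick illustration of the three cases above (not part of the commit; the shapes below are made up for the example), the per-channel statistics can be computed directly over the dimensions named under each expectation:

```python
import torch

eps = 1e-5

# Images: X is [B, C, H, W]; one mean/variance per channel, taken over (B, H, W)
x = torch.randn(4, 3, 8, 8)
mean = x.mean(dim=(0, 2, 3), keepdim=True)
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
bn = (x - mean) / torch.sqrt(var + eps)  # with gamma = 1 and beta = 0

# Vector embeddings: X is [B, C]; one mean/variance per feature, taken over B
x = torch.randn(4, 16)
bn = (x - x.mean(dim=0)) / torch.sqrt(x.var(dim=0, unbiased=False) + eps)

# Sequence embeddings: X is [B, C, L]; one mean/variance per channel, taken over (B, L)
x = torch.randn(4, 16, 10)
mean = x.mean(dim=(0, 2), keepdim=True)
var = x.var(dim=(0, 2), unbiased=False, keepdim=True)
bn = (x - mean) / torch.sqrt(var + eps)
```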
"""
---
title: Layer Normalization
summary: >
A PyTorch implementation/tutorial of layer normalization.
---
# Layer Normalization
This is a [PyTorch](https://pytorch.org) implementation of
[Layer Normalization](https://arxiv.org/abs/1607.06450).
### Limitations of [Batch Normalization](../batch_norm/index.html)
* You need to maintain running means.
* Tricky for RNNs. Do you need different normalizations for each step?
* Doesn't work with small batch sizes;
large NLP models are usually trained with small batch sizes.
* Need to compute means and variances across devices in distributed training.
## Layer Normalization
Layer normalization is a simpler normalization method that works
on a wider range of settings.
Layer normalization transforms the inputs to have zero mean and unit variance
across the features.
*Note that batch normalization fixes the zero mean and unit variance for each feature,
computed across the batch, while layer normalization does it for each sample,
computed across all the features.*
Layer normalization is generally used for NLP tasks.
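As a rough sketch of that difference (not part of this commit), using PyTorch's built-in layers:

```python
import torch
from torch import nn

# A batch of 4 vector embeddings with 8 features each
x = torch.randn(4, 8)

# Batch normalization: one mean/variance per feature, computed across the batch
bn = nn.BatchNorm1d(8, affine=False)
print(bn(x).mean(dim=0))  # approximately zero for every feature column

# Layer normalization: one mean/variance per sample, computed across the features
ln = nn.LayerNorm(8, elementwise_affine=False)
print(ln(x).mean(dim=1))  # approximately zero for every sample row
```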
Here's [the training code](mnist.html) and a notebook for training
a CNN classifier that uses batch normalization for the MNIST dataset.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=011254fe647011ebbb8e0242ac1c0002)
"""
import torch
from torch import nn
class BatchNorm(nn.Module):
    """
    ## Batch Normalization Layer
    """

    def __init__(self, channels: int, *,
                 eps: float = 1e-5, momentum: float = 0.1,
                 affine: bool = True, track_running_stats: bool = True):
        """
        * `channels` is the number of features in the input
        * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
        * `momentum` is the momentum in taking the exponential moving average
        * `affine` is whether to scale and shift the normalized value
        * `track_running_stats` is whether to calculate the moving averages of mean and variance

        We've tried to use the same names for arguments as the PyTorch `BatchNorm` implementation.
        """
        super().__init__()

        self.channels = channels
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        # Create parameters for $\gamma$ and $\beta$ for scale and shift
        if self.affine:
            self.scale = nn.Parameter(torch.ones(channels))
            self.shift = nn.Parameter(torch.zeros(channels))
        # Create buffers to store exponential moving averages of
        # mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$
        if self.track_running_stats:
            self.register_buffer('exp_mean', torch.zeros(channels))
            self.register_buffer('exp_var', torch.ones(channels))

    def forward(self, x: torch.Tensor):
        """
        `x` is a tensor of shape `[batch_size, channels, *]`.
        `*` could be any number of (even zero) dimensions.
        For example, in an image (2D) convolution this will be
        `[batch_size, channels, height, width]`
        """
        # Keep the original shape
        x_shape = x.shape
        # Get the batch size
        batch_size = x_shape[0]
        # Sanity check to make sure the number of features is the same
        assert self.channels == x.shape[1]

        # Reshape into `[batch_size, channels, n]`
        x = x.view(batch_size, self.channels, -1)

        # We will calculate the mini-batch mean and variance
        # if we are in training mode or if we have not tracked exponential moving averages
        if self.training or not self.track_running_stats:
            # Calculate the mean across the first and last dimensions;
            # i.e. the mean for each feature $\mathbb{E}[x^{(k)}]$
            mean = x.mean(dim=[0, 2])
            # Calculate the squared mean across the first and last dimensions;
            # i.e. the mean for each feature $\mathbb{E}[(x^{(k)})^2]$
            mean_x2 = (x ** 2).mean(dim=[0, 2])
            # Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$
            var = mean_x2 - mean ** 2

            # Update exponential moving averages
            if self.training and self.track_running_stats:
                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var
        # Use exponential moving averages as estimates
        else:
            mean = self.exp_mean
            var = self.exp_var

        # Normalize $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}] + \epsilon}}$$
        x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1)
        # Scale and shift $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$
        if self.affine:
            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)

        # Reshape to original and return
        return x_norm.view(x_shape)
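As a quick sanity check (not part of the commit), the layer above can be compared, in training mode, against PyTorch's built-in `nn.BatchNorm2d`; the shapes are made up for the example and the snippet assumes the `BatchNorm` class defined above is in scope:

```python
import torch
from torch import nn

# A small batch of images: batch_size=2, channels=3, height=width=4
x = torch.randn(2, 3, 4, 4)

bn = BatchNorm(3)        # the layer implemented above
ref = nn.BatchNorm2d(3)  # PyTorch's reference implementation

# Both initialize gamma=1 and beta=0 and normalize with the biased batch
# statistics in training mode, so the outputs should match closely
assert torch.allclose(bn(x), ref(x), atol=1e-5)
```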