From 1eb9fe4dd6936f20dc2ae79a0f249c687f2be2ba Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Tue, 2 Feb 2021 10:07:06 +0530 Subject: [PATCH] batch norm formulas --- docs/normalization/batch_norm/index.html | 97 +++-- docs/normalization/layer_norm/index.html | 380 ++++++++++++++++++ docs/sitemap.xml | 2 +- labml_nn/normalization/batch_norm/__init__.py | 24 +- labml_nn/normalization/layer_norm/__init__.py | 123 ++++++ 5 files changed, 584 insertions(+), 42 deletions(-) create mode 100644 docs/normalization/layer_norm/index.html create mode 100644 labml_nn/normalization/layer_norm/__init__.py diff --git a/docs/normalization/batch_norm/index.html b/docs/normalization/batch_norm/index.html index 424cb75a..4dc6a4bf 100644 --- a/docs/normalization/batch_norm/index.html +++ b/docs/normalization/batch_norm/index.html @@ -3,12 +3,12 @@ - + - + @@ -18,7 +18,7 @@ - + Batch Normalization @@ -151,6 +151,25 @@ a CNN classifier that use batch normalization for MNIST dataset.

#

Batch Normalization Layer

+

Batch normalization layer $\text{BN}$ normalizes the input $X$ as follows:

+

When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
+where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width.
+$$\text{BN}(X) = \gamma \frac{X - \underset{B, H, W}{\mathbb{E}}[X]}{\sqrt{\underset{B, H, W}{Var}[X] + \epsilon}} + \beta$$
+
+

+

When input $X \in \mathbb{R}^{B \times C}$ is a batch of vector embeddings,
+where $B$ is the batch size and $C$ is the number of features.
+$$\text{BN}(X) = \gamma \frac{X - \underset{B}{\mathbb{E}}[X]}{\sqrt{\underset{B}{Var}[X] + \epsilon}} + \beta$$
+
+

+

When input $X \in \mathbb{R}^{B \times C \times L}$ is a batch of sequence embeddings,
+where $B$ is the batch size, $C$ is the number of features, and $L$ is the length of the sequence.
+$$\text{BN}(X) = \gamma \frac{X - \underset{B, L}{\mathbb{E}}[X]}{\sqrt{\underset{B, L}{Var}[X] + \epsilon}} + \beta$$
+
+
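As a quick check of the image case above, here is a small standalone sketch (illustrative only, not part of this patch; the shapes and tolerance are arbitrary choices) that computes the per-channel statistics over $B$, $H$, $W$ by hand and compares the result with PyTorch's nn.BatchNorm2d in training mode:

```python
# Hedged sketch (not from this patch): check the [B, C, H, W] formula against
# torch.nn.BatchNorm2d. Shapes and tolerance are arbitrary illustration choices.
import torch
from torch import nn

x = torch.randn(8, 16, 32, 32)             # a batch of image representations
bn = nn.BatchNorm2d(16, eps=1e-5).train()  # training mode -> normalizes with batch statistics

mean = x.mean(dim=(0, 2, 3), keepdim=True)                # E over B, H, W, per channel
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)  # Var over B, H, W, per channel
x_hat = (x - mean) / torch.sqrt(var + bn.eps)             # gamma = 1, beta = 0 at initialization

assert torch.allclose(x_hat, bn(x), atol=1e-4)
```

The vector and sequence cases follow the same pattern, with the reduction taken over $B$ or over $B, L$ respectively.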

102class BatchNorm(nn.Module):
@@ -171,9 +190,9 @@ a CNN classifier that use batch normalization for MNIST dataset.

We’ve tried to use the same argument names as PyTorch’s BatchNorm implementation.

-
107    def __init__(self, channels: int, *,
-108                 eps: float = 1e-5, momentum: float = 0.1,
-109                 affine: bool = True, track_running_stats: bool = True):
+
127    def __init__(self, channels: int, *,
+128                 eps: float = 1e-5, momentum: float = 0.1,
+129                 affine: bool = True, track_running_stats: bool = True):
@@ -184,14 +203,14 @@ a CNN classifier that use batch normalization for MNIST dataset.

-
119        super().__init__()
-120
-121        self.channels = channels
-122
-123        self.eps = eps
-124        self.momentum = momentum
-125        self.affine = affine
-126        self.track_running_stats = track_running_stats
+
139        super().__init__()
+140
+141        self.channels = channels
+142
+143        self.eps = eps
+144        self.momentum = momentum
+145        self.affine = affine
+146        self.track_running_stats = track_running_stats
@@ -202,9 +221,9 @@ a CNN classifier that use batch normalization for MNIST dataset.

Create parameters for $\gamma$ and $\beta$ for scale and shift

-
128        if self.affine:
-129            self.scale = nn.Parameter(torch.ones(channels))
-130            self.shift = nn.Parameter(torch.zeros(channels))
+
148        if self.affine:
+149            self.scale = nn.Parameter(torch.ones(channels))
+150            self.shift = nn.Parameter(torch.zeros(channels))
@@ -216,9 +235,9 @@ a CNN classifier that use batch normalization for MNIST dataset.

mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

-
133        if self.track_running_stats:
-134            self.register_buffer('exp_mean', torch.zeros(channels))
-135            self.register_buffer('exp_var', torch.ones(channels))
+
153        if self.track_running_stats:
+154            self.register_buffer('exp_mean', torch.zeros(channels))
+155            self.register_buffer('exp_var', torch.ones(channels))
@@ -232,7 +251,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

[batch_size, channels, height, width]

-
137    def forward(self, x: torch.Tensor):
+
157    def forward(self, x: torch.Tensor):
@@ -243,7 +262,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

Keep the original shape

-
145        x_shape = x.shape
+
165        x_shape = x.shape
@@ -254,7 +273,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

Get the batch size

-
147        batch_size = x_shape[0]
+
167        batch_size = x_shape[0]
@@ -265,7 +284,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

Sanity check to make sure the number of features is the same

-
149        assert self.channels == x.shape[1]
+
169        assert self.channels == x.shape[1]
@@ -276,7 +295,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

Reshape into [batch_size, channels, n]

-
152        x = x.view(batch_size, self.channels, -1)
+
172        x = x.view(batch_size, self.channels, -1)
@@ -288,7 +307,7 @@ mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

if we are in training mode or if we have not tracked exponential moving averages

-
156        if self.training or not self.track_running_stats:
+
176        if self.training or not self.track_running_stats:
@@ -300,7 +319,7 @@ if we are in training mode or if we have not tracked exponential moving averages i.e. the means for each feature $\mathbb{E}[x^{(k)}]$

-
159            mean = x.mean(dim=[0, 2])
+
179            mean = x.mean(dim=[0, 2])
@@ -312,7 +331,7 @@ i.e. the means for each feature $\mathbb{E}[x^{(k)}]$

i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

-
162            mean_x2 = (x ** 2).mean(dim=[0, 2])
+
182            mean_x2 = (x ** 2).mean(dim=[0, 2])
@@ -323,7 +342,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$

-
164            var = mean_x2 - mean ** 2
+
184            var = mean_x2 - mean ** 2
@@ -334,9 +353,9 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

Update exponential moving averages

-
167            if self.training and self.track_running_stats:
-168                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
-169                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var
+
187            if self.training and self.track_running_stats:
+188                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
+189                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var
@@ -347,9 +366,9 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

Use exponential moving averages as estimates

-
171        else:
-172            mean = self.exp_mean
-173            var = self.exp_var
+
191        else:
+192            mean = self.exp_mean
+193            var = self.exp_var
@@ -361,7 +380,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

-
176        x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1)
+
196        x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1)
@@ -373,8 +392,8 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

-
178        if self.affine:
-179            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)
+
198        if self.affine:
+199            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)
@@ -385,7 +404,7 @@ i.e. the means for each feature $\mathbb{E}[(x^{(k)})^2]$

Reshape to original and return

-
182        return x_norm.view(x_shape)
+
202        return x_norm.view(x_shape)
diff --git a/docs/normalization/layer_norm/index.html b/docs/normalization/layer_norm/index.html new file mode 100644 index 00000000..ff2e5fdd --- /dev/null +++ b/docs/normalization/layer_norm/index.html @@ -0,0 +1,380 @@ + + + + + + + + + + + + + + + + + + + + + + + Layer Normalization + + + + + + + + +
+
+
+
+

+ home + normalization + layer_norm +

+

+ + + Github + + Join Slack + + Twitter

+
+
+
+
+ +

Layer Normalization

+

This is a PyTorch implementation of +Layer Normalization.

+

Limitations of Batch Normalization

+
    +
  • You need to maintain running means.
  • +
  • Tricky for RNNs. Do you need different normalizations for each step?
  • +
  • Doesn’t work with small batch sizes; +large NLP models are usually trained with small batch sizes.
  • +
  • Need to compute means and variances across devices in distributed training
  • +
+

Layer Normalization

+

Layer normalization is a simpler normalization method that works
+on a wider range of settings.
+Layer normalization transforms the inputs to have zero mean and unit variance
+across the features.
+Note that batch normalization fixes zero mean and unit variance for each feature across the batch,
+whereas layer normalization does it for each element across all of its features.
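To make that distinction concrete, here is an illustrative sketch (assumed, not code from this patch) that applies both normalizations to the same [batch, features] tensor and checks the layer-norm result against torch.nn.functional.layer_norm:

```python
# Illustrative comparison (not part of this patch): batch norm normalizes each
# feature across the batch; layer norm normalizes each element across its features.
import torch
import torch.nn.functional as F

x = torch.randn(32, 128)  # [batch_size, features]

# Batch-norm style: per-feature statistics, computed over the batch dimension
bn_out = (x - x.mean(dim=0)) / torch.sqrt(x.var(dim=0, unbiased=False) + 1e-5)

# Layer-norm style: per-element statistics, computed over the feature dimension
mean = x.mean(dim=-1, keepdim=True)
var = x.var(dim=-1, unbiased=False, keepdim=True)
ln_out = (x - mean) / torch.sqrt(var + 1e-5)

# Matches PyTorch's functional layer norm (no affine parameters applied)
assert torch.allclose(ln_out, F.layer_norm(x, [128]), atol=1e-5)
```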

+

Layer normalization is generally used for NLP tasks.

+

Here’s the training code and a notebook for training
+a CNN classifier that uses batch normalization for the MNIST dataset.

+

Open In Colab +View Run

+
+
+
39import torch
+40from torch import nn
+
+
+
+
+ +

Batch Normalization Layer

+
+
+
43class BatchNorm(nn.Module):
+
+
+
+
+ +
    +
  • channels is the number of features in the input
  • +
  • eps is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
  • +
  • momentum is the momentum in taking the exponential moving average
  • +
  • affine is whether to scale and shift the normalized value
  • +
  • track_running_stats is whether to calculate the moving averages of mean and variance
  • +
+

We’ve tried to use the same argument names as PyTorch’s BatchNorm implementation.
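For example, the module could be constructed with the arguments listed above (a hypothetical usage snippet; it assumes the BatchNorm class defined in this file, and the values are arbitrary):

```python
# Hypothetical usage (not part of the patch); argument values are arbitrary.
norm = BatchNorm(channels=64, eps=1e-5, momentum=0.1,
                 affine=True, track_running_stats=True)
```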

+
+
+
48    def __init__(self, channels: int, *,
+49                 eps: float = 1e-5, momentum: float = 0.1,
+50                 affine: bool = True, track_running_stats: bool = True):
+
+
+
+
+ + +
+
+
60        super().__init__()
+61
+62        self.channels = channels
+63
+64        self.eps = eps
+65        self.momentum = momentum
+66        self.affine = affine
+67        self.track_running_stats = track_running_stats
+
+
+
+
+ +

Create parameters for $\gamma$ and $\beta$ for scale and shift

+
+
+
69        if self.affine:
+70            self.scale = nn.Parameter(torch.ones(channels))
+71            self.shift = nn.Parameter(torch.zeros(channels))
+
+
+
+
+ +

Create buffers to store exponential moving averages of +mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$

+
+
+
74        if self.track_running_stats:
+75            self.register_buffer('exp_mean', torch.zeros(channels))
+76            self.register_buffer('exp_var', torch.ones(channels))
+
+
+
+
+ +

x is a tensor of shape [batch_size, channels, *].
+* could be any number of (even zero) dimensions.
+ For example, in an image (2D) convolution this will be
+[batch_size, channels, height, width]
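Since forward() flattens everything after the channel dimension, the same module handles image, sequence and plain vector inputs. A rough usage sketch (assuming the BatchNorm class defined in this file; shapes are arbitrary):

```python
# Rough sketch (not from this patch): the trailing dimensions are flattened away,
# so one module normalizes 4D, 3D and 2D inputs with 64 channels/features.
import torch

norm = BatchNorm(64)
for t in (torch.randn(8, 64, 28, 28),   # [batch_size, channels, height, width]
          torch.randn(8, 64, 100),      # [batch_size, channels, length]
          torch.randn(8, 64)):          # [batch_size, channels]
    assert norm(t).shape == t.shape
```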

+
+
+
78    def forward(self, x: torch.Tensor):
+
+
+
+
+ +

Keep the original shape

+
+
+
86        x_shape = x.shape
+
+
+
+
+ +

Get the batch size

+
+
+
88        batch_size = x_shape[0]
+
+
+
+
+ +

Sanity check to make sure the number of features is the same

+
+
+
90        assert self.channels == x.shape[1]
+
+
+
+
+ +

Reshape into [batch_size, channels, n]

+
+
+
93        x = x.view(batch_size, self.channels, -1)
+
+
+
+
+ +

We will calculate the mini-batch mean and variance +if we are in training mode or if we have not tracked exponential moving averages

+
+
+
97        if self.training or not self.track_running_stats:
+
+
+
+
+ +

Calculate the mean across the first and last dimensions;
+i.e. the mean $\mathbb{E}[x^{(k)}]$ for each feature

+
+
+
100            mean = x.mean(dim=[0, 2])
+
+
+
+
+ +

Calculate the mean of the squares across the first and last dimensions;
+i.e. $\mathbb{E}[(x^{(k)})^2]$ for each feature

+
+
+
103            mean_x2 = (x ** 2).mean(dim=[0, 2])
+
+
+
+
+ +

Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$
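This is the standard expansion of the variance, shown here for completeness:

$$Var[x^{(k)}] = \mathbb{E}\big[(x^{(k)} - \mathbb{E}[x^{(k)}])^2\big]
= \mathbb{E}[(x^{(k)})^2] - 2\,\mathbb{E}[x^{(k)}]\,\mathbb{E}[x^{(k)}] + \mathbb{E}[x^{(k)}]^2
= \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$$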

+
+
+
105            var = mean_x2 - mean ** 2
+
+
+
+
+ +

Update exponential moving averages

+
+
+
108            if self.training and self.track_running_stats:
+109                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
+110                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var
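As a standalone toy illustration (values assumed, not from this patch), repeatedly applying this update rule converges the tracked statistics to the statistics being averaged:

```python
# Toy example: the exponential moving average of a constant per-feature batch
# mean converges to that mean; momentum controls how quickly.
import torch

momentum = 0.1
exp_mean = torch.zeros(3)
true_mean = torch.tensor([1.0, 2.0, 3.0])

for _ in range(100):  # pretend every batch has the same per-feature mean
    exp_mean = (1 - momentum) * exp_mean + momentum * true_mean

assert torch.allclose(exp_mean, true_mean, atol=1e-3)
```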
+
+
+
+
+ +

Use exponential moving averages as estimates

+
+
+
112        else:
+113            mean = self.exp_mean
+114            var = self.exp_var
+
+
+
+
+ +

Normalize
+$$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}] + \epsilon}}$$

+
+
+
117        x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1)
+
+
+
+
+ +

Scale and shift
+$$y^{(k)} = \gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$

+
+
+
119        if self.affine:
+120            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)
+
+
+
+
+ +

Reshape to original and return

+
+
+
123        return x_norm.view(x_shape)
+
+
+
+ + + + + + \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 4313ee0f..048381da 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -99,7 +99,7 @@ https://nn.labml.ai/normalization/batch_norm/index.html - 2021-02-01T16:30:00+00:00 + 2021-02-02T16:30:00+00:00 1.00 diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py index c03c5fce..5645fc38 100644 --- a/labml_nn/normalization/batch_norm/__init__.py +++ b/labml_nn/normalization/batch_norm/__init__.py @@ -2,7 +2,7 @@ --- title: Batch Normalization summary: > - A PyTorch implementations/tutorials of batch normalization. + A PyTorch implementation/tutorial of batch normalization. --- # Batch Normalization @@ -100,8 +100,28 @@ from torch import nn class BatchNorm(nn.Module): - """ + r""" ## Batch Normalization Layer + + Batch normalization layer $\text{BN}$ normalizes the input $X$ as follows: + + When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations, + where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width. + $$\text{BN}(X) = \gamma + \frac{X - \underset{B, H, W}{\mathbb{E}}[X]}{\sqrt{\underset{B, H, W}{Var}[X] + \epsilon}} + + \beta$$ + + When input $X \in \mathbb{R}^{B \times C}$ is a batch of vector embeddings, + where $B$ is the batch size and $C$ is the number of features. + $$\text{BN}(X) = \gamma + \frac{X - \underset{B}{\mathbb{E}}[X]}{\sqrt{\underset{B}{Var}[X] + \epsilon}} + + \beta$$ + + When input $X \in \mathbb{R}^{B \times C \times L}$ is a batch of sequence embeddings, + where $B$ is the batch size, $C$ is the number of features, and $L$ is the length of the sequence. + $$\text{BN}(X) = \gamma + \frac{X - \underset{B, L}{\mathbb{E}}[X]}{\sqrt{\underset{B, L}{Var}[X] + \epsilon}} + + \beta$$ """ def __init__(self, channels: int, *, diff --git a/labml_nn/normalization/layer_norm/__init__.py b/labml_nn/normalization/layer_norm/__init__.py new file mode 100644 index 00000000..039669ab --- /dev/null +++ b/labml_nn/normalization/layer_norm/__init__.py @@ -0,0 +1,123 @@ +""" +--- +title: Layer Normalization +summary: > + A PyTorch implementation/tutorial of layer normalization. +--- + +# Layer Normalization + +This is a [PyTorch](https://pytorch.org) implementation of +[Layer Normalization](https://arxiv.org/abs/1607.06450). + +### Limitations of [Batch Normalization](../batch_norm/index.html) + +* You need to maintain running means. +* Tricky for RNNs. Do you need different normalizations for each step? +* Doesn't work with small batch sizes; +large NLP models are usually trained with small batch sizes. +* Need to compute means and variances across devices in distributed training + +## Layer Normalization + +Layer normalization is a simpler normalization method that works +on a wider range of settings. +Layer normalization transformers the inputs to have zero mean and unit variance +across the features. +*Note that batch normalization, fixes the zero mean and unit variance for each vector. +Layer normalization does it for each batch across all elements. + +Layer normalization is generally used for NLP tasks. + +Here's [the training code](mnist.html) and a notebook for training +a CNN classifier that use batch normalization for MNIST dataset. 
+ +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb) +[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=011254fe647011ebbb8e0242ac1c0002) +""" + +import torch +from torch import nn + + +class BatchNorm(nn.Module): + """ + ## Batch Normalization Layer + """ + + def __init__(self, channels: int, *, + eps: float = 1e-5, momentum: float = 0.1, + affine: bool = True, track_running_stats: bool = True): + """ + * `channels` is the number of features in the input + * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability + * `momentum` is the momentum in taking the exponential moving average + * `affine` is whether to scale and shift the normalized value + * `track_running_stats` is whether to calculate the moving averages or mean and variance + + We've tried to use the same names for arguments as PyTorch `BatchNorm` implementation. + """ + super().__init__() + + self.channels = channels + + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + # Create parameters for $\gamma$ and $\beta$ for scale and shift + if self.affine: + self.scale = nn.Parameter(torch.ones(channels)) + self.shift = nn.Parameter(torch.zeros(channels)) + # Create buffers to store exponential moving averages of + # mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$ + if self.track_running_stats: + self.register_buffer('exp_mean', torch.zeros(channels)) + self.register_buffer('exp_var', torch.ones(channels)) + + def forward(self, x: torch.Tensor): + """ + `x` is a tensor of shape `[batch_size, channels, *]`. + `*` could be any (even *) dimensions. + For example, in an image (2D) convolution this will be + `[batch_size, channels, height, width]` + """ + # Keep the original shape + x_shape = x.shape + # Get the batch size + batch_size = x_shape[0] + # Sanity check to make sure the number of features is same + assert self.channels == x.shape[1] + + # Reshape into `[batch_size, channels, n]` + x = x.view(batch_size, self.channels, -1) + + # We will calculate the mini-batch mean and variance + # if we are in training mode or if we have not tracked exponential moving averages + if self.training or not self.track_running_stats: + # Calculate the mean across first and last dimension; + # i.e. the means for each feature $\mathbb{E}[x^{(k)}]$ + mean = x.mean(dim=[0, 2]) + # Calculate the squared mean across first and last dimension; + # i.e. 
the means for each feature $\mathbb{E}[(x^{(k)})^2]$ + mean_x2 = (x ** 2).mean(dim=[0, 2]) + # Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$ + var = mean_x2 - mean ** 2 + + # Update exponential moving averages + if self.training and self.track_running_stats: + self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean + self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var + # Use exponential moving averages as estimates + else: + mean = self.exp_mean + var = self.exp_var + + # Normalize $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}] + \epsilon}}$$ + x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1) + # Scale and shift $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$ + if self.affine: + x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1) + + # Reshape to original and return + return x_norm.view(x_shape) -- GitLab