Commit 1eb9fe4d authored by Varuna Jayasiri

batch norm formulas

Parent cee8a37a
This diff is collapsed.
......@@ -99,7 +99,7 @@
<url>
<loc>https://nn.labml.ai/normalization/batch_norm/index.html</loc>
<lastmod>2021-02-01T16:30:00+00:00</lastmod>
<lastmod>2021-02-02T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
......
......@@ -2,7 +2,7 @@
---
title: Batch Normalization
summary: >
A PyTorch implementations/tutorials of batch normalization.
A PyTorch implementation/tutorial of batch normalization.
---
# Batch Normalization
......@@ -100,8 +100,28 @@ from torch import nn
class BatchNorm(nn.Module):
"""
r"""
## Batch Normalization Layer
Batch normalization layer $\text{BN}$ normalizes the input $X$ as follows:
When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
where $B$ is the batch size, $C$ is the number of channels, $H$ is the height and $W$ is the width.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B, H, W}{\mathbb{E}}[X]}{\sqrt{\underset{B, H, W}{Var}[X] + \epsilon}}
+ \beta$$
When input $X \in \mathbb{R}^{B \times C}$ is a batch of vector embeddings,
where $B$ is the batch size and $C$ is the number of features.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B}{\mathbb{E}}[X]}{\sqrt{\underset{B}{Var}[X] + \epsilon}}
+ \beta$$
When input $X \in \mathbb{R}^{B \times C \times L}$ is a batch of sequence embeddings,
where $B$ is the batch size, $C$ is the number of features, and $L$ is the length of the sequence.
$$\text{BN}(X) = \gamma
\frac{X - \underset{B, L}{\mathbb{E}}[X]}{\sqrt{\underset{B, L}{Var}[X] + \epsilon}}
+ \beta$$
"""
def __init__(self, channels: int, *,
......
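As a quick illustration of the three cases above (not part of the commit; the shapes below are made up for the example), the per-channel statistics can be computed directly over the dimensions named under each expectation:

```python
import torch

eps = 1e-5

# Images: X is [B, C, H, W]; one mean/variance per channel, taken over (B, H, W)
x = torch.randn(4, 3, 8, 8)
mean = x.mean(dim=(0, 2, 3), keepdim=True)
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
bn = (x - mean) / torch.sqrt(var + eps)  # with gamma = 1 and beta = 0

# Vector embeddings: X is [B, C]; one mean/variance per feature, taken over B
x = torch.randn(4, 16)
bn = (x - x.mean(dim=0)) / torch.sqrt(x.var(dim=0, unbiased=False) + eps)

# Sequence embeddings: X is [B, C, L]; one mean/variance per channel, taken over (B, L)
x = torch.randn(4, 16, 10)
mean = x.mean(dim=(0, 2), keepdim=True)
var = x.var(dim=(0, 2), unbiased=False, keepdim=True)
bn = (x - mean) / torch.sqrt(var + eps)
```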
"""
---
title: Layer Normalization
summary: >
A PyTorch implementation/tutorial of layer normalization.
---
# Layer Normalization
This is a [PyTorch](https://pytorch.org) implementation of
[Layer Normalization](https://arxiv.org/abs/1607.06450).
### Limitations of [Batch Normalization](../batch_norm/index.html)
* You need to maintain running means.
* Tricky for RNNs. Do you need different normalizations for each step?
* Doesn't work with small batch sizes;
large NLP models are usually trained with small batch sizes.
* Need to compute means and variances across devices in distributed training.
## Layer Normalization
Layer normalization is a simpler normalization method that works
on a wider range of settings.
Layer normalization transforms the inputs to have zero mean and unit variance
across the features.
*Note that batch normalization fixes the zero mean and unit variance for each feature,
computed across the batch, while layer normalization does it for each sample,
computed across all the features.*
Layer normalization is generally used for NLP tasks.
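As a rough sketch of that difference (not part of this commit), using PyTorch's built-in layers:

```python
import torch
from torch import nn

# A batch of 4 vector embeddings with 8 features each
x = torch.randn(4, 8)

# Batch normalization: one mean/variance per feature, computed across the batch
bn = nn.BatchNorm1d(8, affine=False)
print(bn(x).mean(dim=0))  # approximately zero for every feature column

# Layer normalization: one mean/variance per sample, computed across the features
ln = nn.LayerNorm(8, elementwise_affine=False)
print(ln(x).mean(dim=1))  # approximately zero for every sample row
```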
Here's [the training code](mnist.html) and a notebook for training
a CNN classifier that uses batch normalization for the MNIST dataset.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lab-ml/nn/blob/master/labml_nn/normalization/batch_norm/mnist.ipynb)
[![View Run](https://img.shields.io/badge/labml-experiment-brightgreen)](https://web.lab-ml.com/run?uuid=011254fe647011ebbb8e0242ac1c0002)
"""
import torch
from torch import nn
class BatchNorm(nn.Module):
    """
    ## Batch Normalization Layer
    """

    def __init__(self, channels: int, *,
                 eps: float = 1e-5, momentum: float = 0.1,
                 affine: bool = True, track_running_stats: bool = True):
        """
        * `channels` is the number of features in the input
        * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
        * `momentum` is the momentum in taking the exponential moving average
        * `affine` is whether to scale and shift the normalized value
        * `track_running_stats` is whether to calculate the moving averages of mean and variance

        We've tried to use the same names for arguments as the PyTorch `BatchNorm` implementation.
        """
        super().__init__()

        self.channels = channels
        self.eps = eps
        self.momentum = momentum
        self.affine = affine
        self.track_running_stats = track_running_stats
        # Create parameters for $\gamma$ and $\beta$ for scale and shift
        if self.affine:
            self.scale = nn.Parameter(torch.ones(channels))
            self.shift = nn.Parameter(torch.zeros(channels))
        # Create buffers to store exponential moving averages of
        # mean $\mathbb{E}[x^{(k)}]$ and variance $Var[x^{(k)}]$
        if self.track_running_stats:
            self.register_buffer('exp_mean', torch.zeros(channels))
            self.register_buffer('exp_var', torch.ones(channels))

    def forward(self, x: torch.Tensor):
        """
        `x` is a tensor of shape `[batch_size, channels, *]`.
        `*` could be any number of (even zero) dimensions.
        For example, in an image (2D) convolution this will be
        `[batch_size, channels, height, width]`
        """
        # Keep the original shape
        x_shape = x.shape
        # Get the batch size
        batch_size = x_shape[0]
        # Sanity check to make sure the number of features is the same
        assert self.channels == x.shape[1]

        # Reshape into `[batch_size, channels, n]`
        x = x.view(batch_size, self.channels, -1)

        # We will calculate the mini-batch mean and variance
        # if we are in training mode or if we have not tracked exponential moving averages
        if self.training or not self.track_running_stats:
            # Calculate the mean across the first and last dimensions;
            # i.e. the mean for each feature $\mathbb{E}[x^{(k)}]$
            mean = x.mean(dim=[0, 2])
            # Calculate the squared mean across the first and last dimensions;
            # i.e. the mean for each feature $\mathbb{E}[(x^{(k)})^2]$
            mean_x2 = (x ** 2).mean(dim=[0, 2])
            # Variance for each feature $Var[x^{(k)}] = \mathbb{E}[(x^{(k)})^2] - \mathbb{E}[x^{(k)}]^2$
            var = mean_x2 - mean ** 2

            # Update exponential moving averages
            if self.training and self.track_running_stats:
                self.exp_mean = (1 - self.momentum) * self.exp_mean + self.momentum * mean
                self.exp_var = (1 - self.momentum) * self.exp_var + self.momentum * var
        # Use exponential moving averages as estimates
        else:
            mean = self.exp_mean
            var = self.exp_var

        # Normalize $$\hat{x}^{(k)} = \frac{x^{(k)} - \mathbb{E}[x^{(k)}]}{\sqrt{Var[x^{(k)}] + \epsilon}}$$
        x_norm = (x - mean.view(1, -1, 1)) / torch.sqrt(var + self.eps).view(1, -1, 1)
        # Scale and shift $$y^{(k)} =\gamma^{(k)} \hat{x}^{(k)} + \beta^{(k)}$$
        if self.affine:
            x_norm = self.scale.view(1, -1, 1) * x_norm + self.shift.view(1, -1, 1)

        # Reshape to original and return
        return x_norm.view(x_shape)
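As a quick sanity check (not part of the commit), the layer above can be compared, in training mode, against PyTorch's built-in `nn.BatchNorm2d`; the shapes are made up for the example and the snippet assumes the `BatchNorm` class defined above is in scope:

```python
import torch
from torch import nn

# A small batch of images: batch_size=2, channels=3, height=width=4
x = torch.randn(2, 3, 4, 4)

bn = BatchNorm(3)        # the layer implemented above
ref = nn.BatchNorm2d(3)  # PyTorch's reference implementation

# Both initialize gamma=1 and beta=0 and normalize with the biased batch
# statistics in training mode, so the outputs should match closely
assert torch.allclose(bn(x), ref(x), atol=1e-5)
```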