diff --git a/docs/sitemap.xml b/docs/sitemap.xml
index 7579d06e90ae1821d8949f9a760cff3dbbad29dc..4e8679cf7841b5d618115a20c82173f116ae4012 100644
--- a/docs/sitemap.xml
+++ b/docs/sitemap.xml
@@ -204,7 +204,7 @@
<loc>https://nn.labml.ai/normalization/batch_norm/mnist.html</loc>
-<lastmod>2021-08-20T16:30:00+00:00</lastmod>
+<lastmod>2021-08-21T16:30:00+00:00</lastmod>
<priority>1.00</priority>
diff --git a/docs/uncertainty/evidence/index.html b/docs/uncertainty/evidence/index.html
index 9cce99edeef49732ec94ff6613a129198d13a84a..de5bd1b421f89c48cc020fb58593e2b21abd2afd 100644
--- a/docs/uncertainty/evidence/index.html
+++ b/docs/uncertainty/evidence/index.html
@@ -85,14 +85,14 @@ and $u = \frac{K}{S}$ where $S = \sum_{k=1}^K (e_k + 1)$.
The paper uses the term evidence as a measure of the amount of support
collected from data in favor of a sample being classified into a certain class.
This corresponds to a Dirichlet distribution
-with parameters $\color{cyan}{\alpha_k} = e_k + 1$, and
- $\color{cyan}{\alpha_0} = S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ is known as the Dirichlet strength.
-Dirichlet distribution $D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}})$
+with parameters $\color{orange}{\alpha_k} = e_k + 1$, and
+ $\color{orange}{\alpha_0} = S = \sum_{k=1}^K \color{orange}{\alpha_k}$ is known as the Dirichlet strength.
+Dirichlet distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$
is a distribution over categorical distributions; i.e. you can sample class probabilities
from a Dirichlet distribution.
-The expected probability for class $k$ is $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$.
+The expected probability for class $k$ is $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$.
We get the model to output evidences
-$$\mathbf{e} = \color{cyan}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$
+$$\mathbf{e} = \color{orange}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$
for a given input $\mathbf{x}$.
We use a function such as
ReLU or a
@@ -116,7 +116,7 @@ We use a function such as
Type II Maximum Likelihood Loss
-The distribution D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}}) is a prior on the likelihood
+The distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a prior on the likelihood
$Multi(\mathbf{y} \vert p)$,
and the negative log marginal likelihood is calculated by integrating over class probabilities
$\mathbf{p}$.
@@ -127,11 +127,11 @@ $Multi(\mathbf{y} \vert p)$,
&= -\log \Bigg(
\int
\prod_{k=1}^K p_k^{y_k}
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
-&= \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)
+&= \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)
\end{align}
@@ -158,7 +158,7 @@ $Multi(\mathbf{y} \vert p)$,
- $\color{cyan}{\alpha_k} = e_k + 1$
+ $\color{orange}{\alpha_k} = e_k + 1$
@@ -169,7 +169,7 @@ $Multi(\mathbf{y} \vert p)$,
-$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+$S = \sum_{k=1}^K \color{orange}{\alpha_k}$
92 strength = alpha.sum(dim=-1)
@@ -180,7 +180,7 @@ $Multi(\mathbf{y} \vert p)$,
-Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)$
+Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)$
95 loss = (target * (strength.log()[:, None] - alpha.log())).sum(dim=-1)
@@ -217,11 +217,11 @@ and sums it over all possible outcomes based on probability distribution.
&= -\log \Bigg(
\int
\Big[ \sum_{k=1}^K -y_k \log p_k \Big]
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
-&= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)
+&= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)
\end{align}
where $\psi(\cdot)$ is the $digamma$ function.
@@ -249,7 +249,7 @@ and sums it over all possible outcomes based on probability distribution.
-$\color{cyan}{\alpha_k} = e_k + 1$
+$\color{orange}{\alpha_k} = e_k + 1$
136 alpha = evidence + 1.
@@ -260,7 +260,7 @@ and sums it over all possible outcomes based on probability distribution.
-$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+$S = \sum_{k=1}^K \color{orange}{\alpha_k}$
138 strength = alpha.sum(dim=-1)
@@ -271,7 +271,7 @@ and sums it over all possible outcomes based on probability distribution.
-Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)$
+Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)$
141 loss = (target * (torch.digamma(strength)[:, None] - torch.digamma(alpha))).sum(dim=-1)
@@ -305,19 +305,19 @@ and sums it over all possible outcomes based on probability distribution.
&= -\log \Bigg(
\int
\Big[ \sum_{k=1}^K (y_k - p_k)^2 \Big]
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
&= \sum_{k=1}^K \mathbb{E} \Big[ y_k^2 -2 y_k p_k + p_k^2 \Big] \\
&= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big)
\end{align}
-Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$$
+Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$$
is the expected probability when sampled from the Dirichlet distribution
and $$\mathbb{E}[p_k^2] = \mathbb{E}[p_k]^2 + \text{Var}(p_k)$$
where
-$$\text{Var}(p_k) = \frac{\color{cyan}{\alpha_k}(S - \color{cyan}{\alpha_k})}{S^2 (S + 1)}
+$$\text{Var}(p_k) = \frac{\color{orange}{\alpha_k}(S - \color{orange}{\alpha_k})}{S^2 (S + 1)}
= \frac{\hat{p}_k(1 - \hat{p}_k)}{S + 1}$$
is the variance.
This gives,
@@ -355,7 +355,7 @@ the second part is the variance.
-$\color{cyan}{\alpha_k} = e_k + 1$
+$\color{orange}{\alpha_k} = e_k + 1$
197 alpha = evidence + 1.
@@ -366,7 +366,7 @@ the second part is the variance.
-$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+$S = \sum_{k=1}^K \color{orange}{\alpha_k}$
199 strength = alpha.sum(dim=-1)
@@ -377,7 +377,7 @@ the second part is the variance.
-$\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$
+$\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$
201 p = alpha / strength[:, None]
@@ -435,7 +435,7 @@ the second part is the variance.
KL Divergence Regularization Loss
This tries to shrink the total evidence to zero if the sample cannot be correctly classified.
-First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$ the
+First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$ the
Dirichlet parameters after removing the correct evidence.
+
@@ -637,7 +637,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$
-$\color{cyan}{\alpha_k} = e_k + 1$
+$\color{orange}{\alpha_k} = e_k + 1$
296 alpha = evidence + 1.
@@ -648,7 +648,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$
-$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+$S = \sum_{k=1}^K \color{orange}{\alpha_k}$
298 strength = alpha.sum(dim=-1)
@@ -659,7 +659,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$
-$\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$
+$\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$
301 expected_probability = alpha / strength[:, None]
diff --git a/labml_nn/uncertainty/evidence/__init__.py b/labml_nn/uncertainty/evidence/__init__.py
index b0a911d45bae6fc84c33c3c16224f3955a9c50f8..c76b360769a41dc5f0269e74365b4c2a46bd7551 100644
--- a/labml_nn/uncertainty/evidence/__init__.py
+++ b/labml_nn/uncertainty/evidence/__init__.py
@@ -29,15 +29,15 @@ The paper uses the term evidence as a measure of the amount of support
collected from data in favor of a sample being classified into a certain class.
This corresponds to a [Dirichlet distribution](https://en.wikipedia.org/wiki/Dirichlet_distribution)
-with parameters $\color{cyan}{\alpha_k} = e_k + 1$, and
- $\color{cyan}{\alpha_0} = S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ is known as the Dirichlet strength.
-Dirichlet distribution $D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}})$
+with parameters $\color{orange}{\alpha_k} = e_k + 1$, and
+ $\color{orange}{\alpha_0} = S = \sum_{k=1}^K \color{orange}{\alpha_k}$ is known as the Dirichlet strength.
+Dirichlet distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$
is a distribution over categorical distributions; i.e. you can sample class probabilities
from a Dirichlet distribution.
-The expected probability for class $k$ is $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$.
+The expected probability for class $k$ is $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$.
We get the model to output evidences
-$$\mathbf{e} = \color{cyan}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$
+$$\mathbf{e} = \color{orange}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$
for a given input $\mathbf{x}$.
We use a function such as
[ReLU](https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html) or a
@@ -62,7 +62,7 @@ class MaximumLikelihoodLoss(Module):
## Type II Maximum Likelihood Loss
- The distribution D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}}) is a prior on the likelihood
+ The distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a prior on the likelihood
$Multi(\mathbf{y} \vert p)$,
and the negative log marginal likelihood is calculated by integrating over class probabilities
$\mathbf{p}$.
@@ -74,11 +74,11 @@ class MaximumLikelihoodLoss(Module):
&= -\log \Bigg(
\int
\prod_{k=1}^K p_k^{y_k}
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
- &= \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)
+ &= \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)
\end{align}
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
@@ -86,12 +86,12 @@ class MaximumLikelihoodLoss(Module):
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
- # $\color{cyan}{\alpha_k} = e_k + 1$
+ # $\color{orange}{\alpha_k} = e_k + 1$
alpha = evidence + 1.
- # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+ # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$
strength = alpha.sum(dim=-1)
- # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)$
+ # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)$
loss = (target * (strength.log()[:, None] - alpha.log())).sum(dim=-1)
# Mean loss over the batch
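As a sanity check of the formula above, here is a minimal sketch, separate from the patch, of how the Type II maximum likelihood loss behaves on a toy batch. It assumes the `MaximumLikelihoodLoss` module defined in this file can be called like a regular `torch.nn.Module` and that targets are one-hot:

import torch
from labml_nn.uncertainty.evidence import MaximumLikelihoodLoss

# Hypothetical evidence for a batch of two samples and three classes
evidence = torch.tensor([[5., 0., 0.],   # strong evidence for class 0
                         [0., 0., 0.]])  # no evidence at all
# One-hot targets, both labelled class 0
target = torch.tensor([[1., 0., 0.],
                       [1., 0., 0.]])

loss = MaximumLikelihoodLoss()(evidence, target)
# Per-sample losses: log 8 - log 6 ≈ 0.29 and log 3 - log 1 ≈ 1.10;
# the module returns their mean over the batch

The sample with no evidence gets the larger loss, which is what pushes the model to produce evidence for the correct class.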
@@ -117,11 +117,11 @@ class CrossEntropyBayesRisk(Module):
&= -\log \Bigg(
\int
\Big[ \sum_{k=1}^K -y_k \log p_k \Big]
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
- &= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)
+ &= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)
\end{align}
where $\psi(\cdot)$ is the $digamma$ function.
@@ -132,12 +132,12 @@ class CrossEntropyBayesRisk(Module):
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
- # $\color{cyan}{\alpha_k} = e_k + 1$
+ # $\color{orange}{\alpha_k} = e_k + 1$
alpha = evidence + 1.
- # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+ # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$
strength = alpha.sum(dim=-1)
- # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)$
+ # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)$
loss = (target * (torch.digamma(strength)[:, None] - torch.digamma(alpha))).sum(dim=-1)
# Mean loss over the batch
@@ -159,19 +159,19 @@ class SquaredErrorBayesRisk(Module):
&= -\log \Bigg(
\int
\Big[ \sum_{k=1}^K (y_k - p_k)^2 \Big]
- \frac{1}{B(\color{cyan}{\mathbf{\alpha}})}
- \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1}
+ \frac{1}{B(\color{orange}{\mathbf{\alpha}})}
+ \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1}
d\mathbf{p}
\Bigg ) \\
&= \sum_{k=1}^K \mathbb{E} \Big[ y_k^2 -2 y_k p_k + p_k^2 \Big] \\
&= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big)
\end{align}
- Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$$
+ Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$$
is the expected probability when sampled from the Dirichlet distribution
and $$\mathbb{E}[p_k^2] = \mathbb{E}[p_k]^2 + \text{Var}(p_k)$$
where
- $$\text{Var}(p_k) = \frac{\color{cyan}{\alpha_k}(S - \color{cyan}{\alpha_k})}{S^2 (S + 1)}
+ $$\text{Var}(p_k) = \frac{\color{orange}{\alpha_k}(S - \color{orange}{\alpha_k})}{S^2 (S + 1)}
= \frac{\hat{p}_k(1 - \hat{p}_k)}{S + 1}$$
is the variance.
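Putting these pieces together, this is a small sketch, separate from the patch, of the error/variance decomposition computed directly from the formulas above:

import torch

evidence = torch.tensor([[5., 0., 0.]])      # hypothetical model output
target = torch.tensor([[1., 0., 0.]])        # one-hot label
alpha = evidence + 1.                        # alpha_k = e_k + 1
strength = alpha.sum(dim=-1)                 # S
p = alpha / strength[:, None]                # expected probability p_hat_k
err = (target - p) ** 2                      # (y_k - p_hat_k)^2
var = p * (1 - p) / (strength[:, None] + 1)  # Var(p_k) = p_hat_k (1 - p_hat_k) / (S + 1)
loss = (err + var).sum(dim=-1)               # sum over classes, one value per sample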
@@ -193,11 +193,11 @@ class SquaredErrorBayesRisk(Module):
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
- # $\color{cyan}{\alpha_k} = e_k + 1$
+ # $\color{orange}{\alpha_k} = e_k + 1$
alpha = evidence + 1.
- # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+ # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$
strength = alpha.sum(dim=-1)
- # $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$
+ # $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$
p = alpha / strength[:, None]
# Error $(y_k -\hat{p}_k)^2$
@@ -219,7 +219,7 @@ class KLDivergenceLoss(Module):
This tries to shrink the total evidence to zero if the sample cannot be correctly classified.
- First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$ the
+ First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$ the
Dirichlet parameters after removing the correct evidence.
\begin{align}
@@ -240,12 +240,12 @@ class KLDivergenceLoss(Module):
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
- # $\color{cyan}{\alpha_k} = e_k + 1$
+ # $\color{orange}{\alpha_k} = e_k + 1$
alpha = evidence + 1.
# Number of classes
n_classes = evidence.shape[-1]
# Remove non-misleading evidence
- # $$\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$$
+ # $$\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$$
alpha_tilde = target + (1 - target) * alpha
# $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$
strength_tilde = alpha_tilde.sum(dim=-1)
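Since this term only penalizes misleading evidence, it is meant to be added to one of the Bayes-risk losses above rather than used on its own. A minimal sketch, separate from the patch, with `kl_weight` as a hypothetical weighting coefficient that is ramped up over training:

import torch
from labml_nn.uncertainty.evidence import SquaredErrorBayesRisk, KLDivergenceLoss

data_loss = SquaredErrorBayesRisk()
kl_loss = KLDivergenceLoss()

def total_loss(evidence: torch.Tensor, target: torch.Tensor, kl_weight: float) -> torch.Tensor:
    # Bayes-risk term plus the weighted KL regularization term
    return data_loss(evidence, target) + kl_weight * kl_loss(evidence, target)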
@@ -292,12 +292,12 @@ class TrackStatistics(Module):
# Track accuracy
tracker.add('accuracy.', match.sum() / match.shape[0])
- # $\color{cyan}{\alpha_k} = e_k + 1$
+ # $\color{orange}{\alpha_k} = e_k + 1$
alpha = evidence + 1.
- # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$
+ # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$
strength = alpha.sum(dim=-1)
- # $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$
+ # $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$
expected_probability = alpha / strength[:, None]
# Expected probability of the selected (greedy highest probability) class
expected_probability, _ = expected_probability.max(dim=-1)
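At inference time the same quantities give both a prediction and an uncertainty estimate. A minimal sketch, separate from the patch, using only the relations from the documentation above, $\hat{p}_k = \frac{\alpha_k}{S}$ and $u = \frac{K}{S}$, with `model` as a hypothetical network whose outputs are passed through ReLU so that $\mathbf{e} \ge 0$:

import torch

def predict_with_uncertainty(model: torch.nn.Module, x: torch.Tensor):
    evidence = model(x)                                # e >= 0, shape [batch_size, n_classes]
    alpha = evidence + 1.                              # alpha_k = e_k + 1
    strength = alpha.sum(dim=-1, keepdim=True)         # S
    expected_probability = alpha / strength            # p_hat_k = alpha_k / S
    uncertainty = evidence.shape[-1] / strength[:, 0]  # u = K / S
    return expected_probability.argmax(dim=-1), uncertainty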