diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 7579d06e90ae1821d8949f9a760cff3dbbad29dc..4e8679cf7841b5d618115a20c82173f116ae4012 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -204,7 +204,7 @@ https://nn.labml.ai/normalization/batch_norm/mnist.html - 2021-08-20T16:30:00+00:00 + 2021-08-21T16:30:00+00:00 1.00 diff --git a/docs/uncertainty/evidence/index.html b/docs/uncertainty/evidence/index.html index 9cce99edeef49732ec94ff6613a129198d13a84a..de5bd1b421f89c48cc020fb58593e2b21abd2afd 100644 --- a/docs/uncertainty/evidence/index.html +++ b/docs/uncertainty/evidence/index.html @@ -85,14 +85,14 @@ and $u = \frac{K}{S}$ where $S = \sum_{k=1}^K (e_k + 1)$. The paper uses the term evidence as a measure of the amount of support collected from data in favor of classifying a sample into a certain class.

This corresponds to a Dirichlet distribution -with parameters $\color{cyan}{\alpha_k} = e_k + 1$, and - $\color{cyan}{\alpha_0} = S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ is known as the Dirichlet strength. -Dirichlet distribution $D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}})$ +with parameters $\color{orange}{\alpha_k} = e_k + 1$, and + $\color{orange}{\alpha_0} = S = \sum_{k=1}^K \color{orange}{\alpha_k}$ is known as the Dirichlet strength. +Dirichlet distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a distribution over categorical distributions; i.e., you can sample class probabilities from a Dirichlet distribution. -The expected probability for class $k$ is $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$.

+The expected probability for class $k$ is $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$.
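Since everything below builds on this property, a quick numerical check can help; this is a sketch with made-up parameter values, not part of the page:

```python
import torch

# Samples from a Dirichlet distribution are themselves categorical
# distributions (non-negative, summing to 1), and their mean is alpha / S.
alpha = torch.tensor([4., 2., 1.])                    # made-up parameters, S = 7
p = torch.distributions.Dirichlet(alpha).sample((100_000,))
print(p[0].sum())                                     # ~1.0: a valid categorical distribution
print(p.mean(dim=0))                                  # ~ alpha / 7 = [0.571, 0.286, 0.143]
```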

We get the model to output evidences -$$\mathbf{e} = \color{cyan}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$ +$$\mathbf{e} = \color{orange}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$ for a given input $\mathbf{x}$. We use a function such as ReLU or a @@ -116,7 +116,7 @@ We use a function such as
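For orientation, a minimal sketch of such a model head (the `EvidenceHead` module and its sizes are hypothetical, not from this repository):

```python
import torch
from torch import nn

class EvidenceHead(nn.Module):
    """Hypothetical example: a linear layer whose outputs are mapped to
    non-negative evidences e_k = f(x | Theta)."""
    def __init__(self, d_in: int, n_classes: int):
        super().__init__()
        self.fc = nn.Linear(d_in, n_classes)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # ReLU (or Softplus) keeps the evidence non-negative
        return torch.relu(self.fc(x))
```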

Type II Maximum Likelihood Loss

-

The distribution D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}}) is a prior on the likelihood +

The distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a prior on the likelihood $Multi(\mathbf{y} \vert p)$, and the negative log marginal likelihood is calculated by integrating over class probabilities $\mathbf{p}$.

@@ -127,11 +127,11 @@ $Multi(\mathbf{y} \vert p)$, &= -\log \Bigg( \int \prod_{k=1}^K p_k^{y_k} - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ -&= \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg) +&= \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg) \end{align}

@@ -158,7 +158,7 @@ $Multi(\mathbf{y} \vert p)$, -

$\color{cyan}{\alpha_k} = e_k + 1$

+

$\color{orange}{\alpha_k} = e_k + 1$

90        alpha = evidence + 1.
@@ -169,7 +169,7 @@ $Multi(\mathbf{y} \vert p)$, -

$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$

+

$S = \sum_{k=1}^K \color{orange}{\alpha_k}$

92        strength = alpha.sum(dim=-1)
@@ -180,7 +180,7 @@ $Multi(\mathbf{y} \vert p)$, -

Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)$

+

Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)$

95        loss = (target * (strength.log()[:, None] - alpha.log())).sum(dim=-1)
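Putting the three steps above together, a self-contained version of this loss might look like the following sketch (the input tensors are made up):

```python
import torch

def maximum_likelihood_loss(evidence: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    alpha = evidence + 1.                     # alpha_k = e_k + 1
    strength = alpha.sum(dim=-1)              # S = sum_k alpha_k
    # L = sum_k y_k (log S - log alpha_k), then mean over the batch
    return (target * (strength.log()[:, None] - alpha.log())).sum(dim=-1).mean()

evidence = torch.tensor([[9., 0., 0.], [0., 9., 0.]])  # made-up evidences
target = torch.tensor([[1., 0., 0.], [1., 0., 0.]])    # second sample's evidence is misleading
print(maximum_likelihood_loss(evidence, target))       # misleading evidence raises the loss
```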
@@ -217,11 +217,11 @@ and sums it over all possible outcomes based on probability distribution.

&= -\log \Bigg( \int \Big[ \sum_{k=1}^K -y_k \log p_k \Big] - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ -&= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg) +&= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg) \end{align}

where $\psi(\cdot)$ is the digamma function.

@@ -249,7 +249,7 @@ and sums it over all possible outcomes based on probability distribution.

-

$\color{cyan}{\alpha_k} = e_k + 1$

+

$\color{orange}{\alpha_k} = e_k + 1$

136        alpha = evidence + 1.
@@ -260,7 +260,7 @@ and sums it over all possible outcomes based on probability distribution.

-

$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$

+

$S = \sum_{k=1}^K \color{orange}{\alpha_k}$

138        strength = alpha.sum(dim=-1)
@@ -271,7 +271,7 @@ and sums it over all possible outcomes based on probability distribution.

-

Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)$

+

Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)$

141        loss = (target * (torch.digamma(strength)[:, None] - torch.digamma(alpha))).sum(dim=-1)
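One way to sanity-check the closed form above is to compare it with a Monte Carlo estimate of the expected cross entropy under the Dirichlet; a sketch with made-up values:

```python
import torch

alpha = torch.tensor([4., 2., 1.])                  # made-up alpha, S = 7
y = torch.tensor([1., 0., 0.])                      # one-hot target

# Closed form: sum_k y_k (psi(S) - psi(alpha_k))
closed = (y * (torch.digamma(alpha.sum()) - torch.digamma(alpha))).sum()

# Monte Carlo: E_{p ~ D(p|alpha)} [ -sum_k y_k log p_k ]
p = torch.distributions.Dirichlet(alpha).sample((200_000,))
mc = -(y * p.log()).sum(dim=-1).mean()

print(closed, mc)                                   # the two should agree closely
```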
@@ -305,19 +305,19 @@ and sums it over all possible outcomes based on probability distribution.

&= -\log \Bigg( \int \Big[ \sum_{k=1}^K (y_k - p_k)^2 \Big] - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ &= \sum_{k=1}^K \mathbb{E} \Big[ y_k^2 -2 y_k p_k + p_k^2 \Big] \\ &= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big) \end{align}

-

Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$$ is the expected probability when sampled from the Dirichlet distribution and $$\mathbb{E}[p_k^2] = \mathbb{E}[p_k]^2 + \text{Var}(p_k)$$ where $$\text{Var}(p_k) = \frac{\color{cyan}{\alpha_k}(S - \color{cyan}{\alpha_k})}{S^2 (S + 1)} = \frac{\hat{p}_k(1 - \hat{p}_k)}{S + 1}$$ is the variance.

+

Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$$ is the expected probability when sampled from the Dirichlet distribution and $$\mathbb{E}[p_k^2] = \mathbb{E}[p_k]^2 + \text{Var}(p_k)$$ where $$\text{Var}(p_k) = \frac{\color{orange}{\alpha_k}(S - \color{orange}{\alpha_k})}{S^2 (S + 1)} = \frac{\hat{p}_k(1 - \hat{p}_k)}{S + 1}$$ is the variance.

This gives, @@ -355,7 +355,7 @@ the second part is the variance.

-

$\color{cyan}{\alpha_k} = e_k + 1$

+

$\color{orange}{\alpha_k} = e_k + 1$

197        alpha = evidence + 1.
@@ -366,7 +366,7 @@ the second part is the variance.

-

$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$

+

$S = \sum_{k=1}^K \color{orange}{\alpha_k}$

199        strength = alpha.sum(dim=-1)
@@ -377,7 +377,7 @@ the second part is the variance.

-

$\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$

+

$\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$

201        p = alpha / strength[:, None]
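The hunk stops at `p`, but the derivation above pins down the remaining steps; a sketch of how they would continue under the same variable names:

```python
# Squared error between the one-hot target and the expected probability
err = (target - p) ** 2
# Variance of p_k under the Dirichlet: p_hat (1 - p_hat) / (S + 1)
var = p * (1 - p) / (strength[:, None] + 1)
# Sum the two parts over classes, then mean over the batch
loss = (err + var).sum(dim=-1).mean()
```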
@@ -435,7 +435,7 @@ the second part is the variance.

KL Divergence Regularization Loss

This tries to shrink the total evidence to zero if the sample cannot be correctly classified.

-

First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$ the +

First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$, the Dirichlet parameters after removing the correct evidence.

+
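The hunks below only show the $\tilde{\alpha}_k$ and $\tilde{S}$ steps, so as orientation here is a sketch of the KL divergence to the uniform Dirichlet $D(\mathbf{p} \vert \mathbf{1})$, written from the standard closed form rather than copied from the repository:

```python
import torch

def kl_to_uniform_dirichlet(alpha_tilde: torch.Tensor) -> torch.Tensor:
    """KL( D(p | alpha_tilde) || D(p | [1, ..., 1]) ), per sample in a batch."""
    n_classes = alpha_tilde.shape[-1]
    strength_tilde = alpha_tilde.sum(dim=-1)          # S_tilde
    # log Gamma(S_tilde) - log Gamma(K) - sum_k log Gamma(alpha_tilde_k)
    first = (torch.lgamma(strength_tilde)
             - torch.lgamma(torch.tensor(float(n_classes)))
             - torch.lgamma(alpha_tilde).sum(dim=-1))
    # sum_k (alpha_tilde_k - 1) (psi(alpha_tilde_k) - psi(S_tilde))
    second = ((alpha_tilde - 1)
              * (torch.digamma(alpha_tilde) - torch.digamma(strength_tilde)[:, None])
              ).sum(dim=-1)
    return first + second
```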

@@ -637,7 +637,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$

-

$\color{cyan}{\alpha_k} = e_k + 1$

+

$\color{orange}{\alpha_k} = e_k + 1$

296        alpha = evidence + 1.
@@ -648,7 +648,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$

-

$S = \sum_{k=1}^K \color{cyan}{\alpha_k}$

+

$S = \sum_{k=1}^K \color{orange}{\alpha_k}$

298        strength = alpha.sum(dim=-1)
@@ -659,7 +659,7 @@ $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$

-

$\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$

+

$\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$

301        expected_probability = alpha / strength[:, None]
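The uncertainty mass $u = \frac{K}{S}$ from the top of this page can be tracked from the same quantities; a sketch with made-up evidence:

```python
import torch

evidence = torch.tensor([[9., 0., 0.], [0., 0., 0.]])  # confident vs. no evidence
alpha = evidence + 1.
strength = alpha.sum(dim=-1)                           # S
u = evidence.shape[-1] / strength                      # u = K / S
print(u)                                               # [0.25, 1.0]: no evidence means full uncertainty
```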
diff --git a/labml_nn/uncertainty/evidence/__init__.py b/labml_nn/uncertainty/evidence/__init__.py index b0a911d45bae6fc84c33c3c16224f3955a9c50f8..c76b360769a41dc5f0269e74365b4c2a46bd7551 100644 --- a/labml_nn/uncertainty/evidence/__init__.py +++ b/labml_nn/uncertainty/evidence/__init__.py @@ -29,15 +29,15 @@ The paper uses the term evidence as a measure of the amount of support collected from data in favor of classifying a sample into a certain class. This corresponds to a [Dirichlet distribution](https://en.wikipedia.org/wiki/Dirichlet_distribution) -with parameters $\color{cyan}{\alpha_k} = e_k + 1$, and - $\color{cyan}{\alpha_0} = S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ is known as the Dirichlet strength. -Dirichlet distribution $D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}})$ +with parameters $\color{orange}{\alpha_k} = e_k + 1$, and + $\color{orange}{\alpha_0} = S = \sum_{k=1}^K \color{orange}{\alpha_k}$ is known as the Dirichlet strength. +Dirichlet distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a distribution over categorical distributions; i.e., you can sample class probabilities from a Dirichlet distribution. -The expected probability for class $k$ is $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$. +The expected probability for class $k$ is $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$. We get the model to output evidences -$$\mathbf{e} = \color{cyan}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$ +$$\mathbf{e} = \color{orange}{\mathbf{\alpha}} - 1 = f(\mathbf{x} | \Theta)$$ for a given input $\mathbf{x}$. We use a function such as [ReLU](https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html) or a @@ -62,7 +62,7 @@ class MaximumLikelihoodLoss(Module): ## Type II Maximum Likelihood Loss - The distribution D(\mathbf{p} \vert \color{cyan}{\mathbf{\alpha}}) is a prior on the likelihood + The distribution $D(\mathbf{p} \vert \color{orange}{\mathbf{\alpha}})$ is a prior on the likelihood $Multi(\mathbf{y} \vert p)$, and the negative log marginal likelihood is calculated by integrating over class probabilities $\mathbf{p}$. @@ -74,11 +74,11 @@ class MaximumLikelihoodLoss(Module): &= -\log \Bigg( \int \prod_{k=1}^K p_k^{y_k} - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ - &= \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg) + &= \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg) \end{align} """ def forward(self, evidence: torch.Tensor, target: torch.Tensor): @@ -86,12 +86,12 @@ class MaximumLikelihoodLoss(Module): * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ - # $\color{cyan}{\alpha_k} = e_k + 1$ + # $\color{orange}{\alpha_k} = e_k + 1$ alpha = evidence + 1.
- # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ + # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$ strength = alpha.sum(dim=-1) - # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{cyan}{\alpha_k} \bigg)$ + # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \log S - \log \color{orange}{\alpha_k} \bigg)$ loss = (target * (strength.log()[:, None] - alpha.log())).sum(dim=-1) # Mean loss over the batch @@ -117,11 +117,11 @@ class CrossEntropyBayesRisk(Module): &= -\log \Bigg( \int \Big[ \sum_{k=1}^K -y_k \log p_k \Big] - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ - &= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg) + &= \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg) \end{align} where $\psi(\cdot)$ is the digamma function. @@ -132,12 +132,12 @@ class CrossEntropyBayesRisk(Module): * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ - # $\color{cyan}{\alpha_k} = e_k + 1$ + # $\color{orange}{\alpha_k} = e_k + 1$ alpha = evidence + 1. - # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ + # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$ strength = alpha.sum(dim=-1) - # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{cyan}{\alpha_k} ) \bigg)$ + # Losses $\mathcal{L}(\Theta) = \sum_{k=1}^K y_k \bigg( \psi(S) - \psi( \color{orange}{\alpha_k} ) \bigg)$ loss = (target * (torch.digamma(strength)[:, None] - torch.digamma(alpha))).sum(dim=-1) # Mean loss over the batch @@ -159,19 +159,19 @@ class SquaredErrorBayesRisk(Module): &= -\log \Bigg( \int \Big[ \sum_{k=1}^K (y_k - p_k)^2 \Big] - \frac{1}{B(\color{cyan}{\mathbf{\alpha}})} - \prod_{k=1}^K p_k^{\color{cyan}{\alpha_k} - 1} + \frac{1}{B(\color{orange}{\mathbf{\alpha}})} + \prod_{k=1}^K p_k^{\color{orange}{\alpha_k} - 1} d\mathbf{p} \Bigg ) \\ &= \sum_{k=1}^K \mathbb{E} \Big[ y_k^2 -2 y_k p_k + p_k^2 \Big] \\ &= \sum_{k=1}^K \Big( y_k^2 -2 y_k \mathbb{E}[p_k] + \mathbb{E}[p_k^2] \Big) \end{align} - Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$$ + Where $$\mathbb{E}[p_k] = \hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$$ is the expected probability when sampled from the Dirichlet distribution and $$\mathbb{E}[p_k^2] = \mathbb{E}[p_k]^2 + \text{Var}(p_k)$$ where - $$\text{Var}(p_k) = \frac{\color{cyan}{\alpha_k}(S - \color{cyan}{\alpha_k})}{S^2 (S + 1)} + $$\text{Var}(p_k) = \frac{\color{orange}{\alpha_k}(S - \color{orange}{\alpha_k})}{S^2 (S + 1)} = \frac{\hat{p}_k(1 - \hat{p}_k)}{S + 1}$$ is the variance. @@ -193,11 +193,11 @@ class SquaredErrorBayesRisk(Module): * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ - # $\color{cyan}{\alpha_k} = e_k + 1$ + # $\color{orange}{\alpha_k} = e_k + 1$ alpha = evidence + 1. - # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ + # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$ strength = alpha.sum(dim=-1) - # $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$ + # $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$ p = alpha / strength[:, None] # Error $(y_k -\hat{p}_k)^2$ @@ -219,7 +219,7 @@ class KLDivergenceLoss(Module): This tries to shrink the total evidence to zero if the sample cannot be correctly classified.
- First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$ the + First we calculate $\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$, the Dirichlet parameters after removing the correct evidence. \begin{align} @@ -240,12 +240,12 @@ class KLDivergenceLoss(Module): * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ - # $\color{cyan}{\alpha_k} = e_k + 1$ + # $\color{orange}{\alpha_k} = e_k + 1$ alpha = evidence + 1. # Number of classes n_classes = evidence.shape[-1] # Remove non-misleading evidence - # $$\tilde{\alpha}_k = y_k + (1 - y_k) \color{cyan}{\alpha_k}$$ + # $$\tilde{\alpha}_k = y_k + (1 - y_k) \color{orange}{\alpha_k}$$ alpha_tilde = target + (1 - target) * alpha # $\tilde{S} = \sum_{k=1}^K \tilde{\alpha}_k$ strength_tilde = alpha_tilde.sum(dim=-1) @@ -292,12 +292,12 @@ class TrackStatistics(Module): # Track accuracy tracker.add('accuracy.', match.sum() / match.shape[0]) - # $\color{cyan}{\alpha_k} = e_k + 1$ + # $\color{orange}{\alpha_k} = e_k + 1$ alpha = evidence + 1. - # $S = \sum_{k=1}^K \color{cyan}{\alpha_k}$ + # $S = \sum_{k=1}^K \color{orange}{\alpha_k}$ strength = alpha.sum(dim=-1) - # $\hat{p}_k = \frac{\color{cyan}{\alpha_k}}{S}$ + # $\hat{p}_k = \frac{\color{orange}{\alpha_k}}{S}$ expected_probability = alpha / strength[:, None] # Expected probability of the selected (greedy highest probability) class expected_probability, _ = expected_probability.max(dim=-1)
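Neither file in this diff shows how these losses are combined. In the paper, the chosen Bayes risk loss is added to the KL regularizer scaled by an annealing coefficient that ramps up over training; a sketch, where `squared_error_bayes_risk` and `kl_divergence_loss` are hypothetical wrappers around the modules above:

```python
def total_loss(evidence, target, epoch: int, anneal_epochs: int = 10):
    # Hypothetical wrappers around SquaredErrorBayesRisk and KLDivergenceLoss
    risk = squared_error_bayes_risk(evidence, target)
    kl = kl_divergence_loss(evidence, target)
    # Ramp the regularizer up over the first few epochs so that early,
    # badly classified samples are not forced to zero evidence immediately
    annealing = min(1., epoch / anneal_epochs)
    return risk + annealing * kl
```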