Commit 4f47bfef authored by Varuna Jayasiri

Merge branch 'master' of github.com:lab-ml/nn

merge
......@@ -603,14 +603,14 @@
<url>
<loc>https://nn.labml.ai/transformers/switch/index.html</loc>
<lastmod>2021-08-17T16:30:00+00:00</lastmod>
<lastmod>2021-09-17T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
<url>
<loc>https://nn.labml.ai/transformers/switch/experiment.html</loc>
<lastmod>2021-09-06T16:30:00+00:00</lastmod>
<lastmod>2021-09-17T16:30:00+00:00</lastmod>
<priority>1.00</priority>
</url>
......
This diff is collapsed.
......@@ -90,7 +90,7 @@ In a distributed setup you would have each FFN (each very large) on a different
discusses dropping tokens when routing is not balanced.</p>
<p>Here&rsquo;s <a href="experiment.html">the training code</a> and a notebook for training a switch transformer on Tiny Shakespeare dataset.</p>
<p><a href="https://colab.research.google.com/github/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/transformers/switch/experiment.ipynb"><img alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
<a href="https://app.labml.ai/run/c4656c605b9311eba13d0242ac1c0002"><img alt="View Run" src="https://img.shields.io/badge/labml-experiment-brightgreen" /></a></p>
<a href="https://app.labml.ai/run/353770ce177c11ecaa5fb74452424f46"><img alt="View Run" src="https://img.shields.io/badge/labml-experiment-brightgreen" /></a></p>
</div>
<div class='code'>
......
......@@ -143,16 +143,13 @@ class SwitchFeedForward(Module):
dropped = torch.cat(dropped)
final_output[dropped, :] = x[dropped, :]
    # Scale the outputs of the experts by the routing probabilities
if self.is_scale_prob:
factor = route_prob_max
# Don't scale the values but multiply by $\frac{p}{\hat{p}} = 1$ so that the gradients flow
# (this is just something we experimented with)
    # Multiply the expert outputs by the probabilities $y = p_i(x) E_i(x)$
final_output = final_output * route_prob_max.view(-1, 1)
else:
factor = route_prob_max / route_prob_max.detach()
# Multiply by the scaling factor
final_output = final_output * factor.view(-1, 1)
# Don't scale the values but multiply by $\frac{p}{\hat{p}} = 1$ so that the gradients flow
# (this is something we experimented with).
final_output = final_output * (route_prob_max / route_prob_max.detach()).view(-1, 1)
# Change the shape of the final output back to `[seq_len, batch_size, d_model]`
final_output = final_output.view(seq_len, batch_size, d_model)
......
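The code change in the hunk above replaces the intermediate `factor` variable with two inlined branches: either scale the expert outputs by the routing probability $p$, or multiply by $\frac{p}{\hat{p}} = 1$ so that the forward values are unchanged while gradients still flow to the router. A minimal standalone sketch of the two options (illustrative tensors only, not the repo's exact shapes or class):

```python
import torch

# Hypothetical routing probabilities for two tokens, produced by a router
# that we want gradients to reach.
route_prob_max = torch.tensor([0.7, 0.9], requires_grad=True)
# Hypothetical expert outputs for the same two tokens, d_model = 3.
final_output = torch.ones(2, 3)

# Option 1 (is_scale_prob == True): scale by the routing probability,
# y = p_i(x) * E_i(x).
scaled = final_output * route_prob_max.view(-1, 1)

# Option 2 (is_scale_prob == False): multiply by p / p.detach() == 1,
# so the forward value is unchanged but the gradient flows through p.
unscaled = final_output * (route_prob_max / route_prob_max.detach()).view(-1, 1)

# Forward values are identical to the raw expert outputs...
assert torch.allclose(unscaled, final_output)
# ...yet backprop still reaches the routing probabilities.
unscaled.sum().backward()
assert route_prob_max.grad is not None
```

The second option is the trick the comment describes as "something we experimented with": it keeps the output magnitude independent of the (possibly small) routing probability while still letting the router receive a learning signal.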
......@@ -5,7 +5,7 @@ with open("readme.md", "r") as f:
setuptools.setup(
name='labml-nn',
version='0.4.112',
version='0.4.113',
author="Varuna Jayasiri, Nipun Wijerathne",
author_email="vpjayasiri@gmail.com, hnipun@gmail.com",
description="🧑‍🏫 Implementations/tutorials of deep learning papers with side-by-side notes 📝; including transformers (original, xl, switch, feedback, vit), optimizers (adam, radam, adabelief), gans(dcgan, cyclegan, stylegan2), 🎮 reinforcement learning (ppo, dqn), capsnet, distillation, etc. 🧠",
......