index.html 76.2 KB
Newer Older
V
Varuna Jayasiri 已提交
1
<!DOCTYPE html>
V
docs  
Varuna Jayasiri 已提交
2
<html lang="en">
V
Varuna Jayasiri 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
<head>
    <meta http-equiv="content-type" content="text/html;charset=utf-8"/>
    <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
    <meta name="description" content="This is an annotated implementation/tutorial of Pay Attention to MLPs (gMLP) in PyTorch."/>

    <meta name="twitter:card" content="summary"/>
    <meta name="twitter:image:src" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
    <meta name="twitter:title" content="Pay Attention to MLPs (gMLP)"/>
    <meta name="twitter:description" content="This is an annotated implementation/tutorial of Pay Attention to MLPs (gMLP) in PyTorch."/>
    <meta name="twitter:site" content="@labmlai"/>
    <meta name="twitter:creator" content="@labmlai"/>

    <meta property="og:url" content="https://nn.labml.ai/transformers/gmlp/index.html"/>
    <meta property="og:title" content="Pay Attention to MLPs (gMLP)"/>
    <meta property="og:image" content="https://avatars1.githubusercontent.com/u/64068543?s=400&amp;v=4"/>
V
Varuna Jayasiri 已提交
18
    <meta property="og:site_name" content="Pay Attention to MLPs (gMLP)"/>
V
Varuna Jayasiri 已提交
19 20 21 22 23 24
    <meta property="og:type" content="object"/>
    <meta property="og:title" content="Pay Attention to MLPs (gMLP)"/>
    <meta property="og:description" content="This is an annotated implementation/tutorial of Pay Attention to MLPs (gMLP) in PyTorch."/>

    <title>Pay Attention to MLPs (gMLP)</title>
    <link rel="shortcut icon" href="/icon.png"/>
V
Varuna Jayasiri 已提交
25
    <link rel="stylesheet" href="../../pylit.css?v=1">
V
Varuna Jayasiri 已提交
26
    <link rel="canonical" href="https://nn.labml.ai/transformers/gmlp/index.html"/>
V
Varuna Jayasiri 已提交
27 28
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.18/dist/katex.min.css" integrity="sha384-zTROYFVGOfTw7JV7KUu8udsvW2fx4lWOsCEDqhBreBwlHI4ioVRtmIvEThzJHGET" crossorigin="anonymous">

V
Varuna Jayasiri 已提交
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=G-4V3HC8HBLH"></script>
    <script>
        // Reuse the global dataLayer array if one already exists; otherwise start with an empty one.
        window.dataLayer = window.dataLayer || [];

        // Appends this call's `arguments` object to dataLayer as a single entry.
        // NOTE(review): presumably the async gtag.js script loaded above consumes these entries —
        // confirm before changing what gets pushed (e.g. converting `arguments` to an array).
        function gtag() {
            dataLayer.push(arguments);
        }

        // Queue a 'js' entry carrying the current Date.
        gtag('js', new Date());

        // Queue a 'config' entry for G-4V3HC8HBLH (the same ID used in the gtag.js URL above).
        gtag('config', 'G-4V3HC8HBLH');
    </script>
</head>
<body>
<div id='container'>
    <div id="background"></div>
    <div class='section'>
        <div class='docs'>
            <p>
                <a class="parent" href="/">home</a>
                <a class="parent" href="../index.html">transformers</a>
                <a class="parent" href="index.html">gmlp</a>
            </p>
            <p>
V
Varuna Jayasiri 已提交
54
                <a href="https://github.com/sponsors/labmlai" target="_blank">
V
Varuna Jayasiri 已提交
55 56
                    <img alt="Sponsor"
                         src="https://img.shields.io/static/v1?label=Sponsor&message=%E2%9D%A4&logo=GitHub&color=%23fe8e86"
V
Varuna Jayasiri 已提交
57 58
                         style="max-width:100%;"/></a>
                <a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations" target="_blank">
V
Varuna Jayasiri 已提交
59 60 61
                    <img alt="Github"
                         src="https://img.shields.io/github/stars/labmlai/annotated_deep_learning_paper_implementations?style=social"
                         style="max-width:100%;"/></a>
V
Varuna Jayasiri 已提交
62
                <a href="https://twitter.com/labmlai" rel="nofollow" target="_blank">
V
Varuna Jayasiri 已提交
63 64 65 66
                    <img alt="Twitter"
                         src="https://img.shields.io/twitter/follow/labmlai?style=social"
                         style="max-width:100%;"/></a>
            </p>
V
Varuna Jayasiri 已提交
67 68 69 70
            <p>
                <a href="https://github.com/labmlai/annotated_deep_learning_paper_implementations/tree/master/labml_nn/transformers/gmlp/__init__.py" target="_blank">
                    View code on Github</a>
            </p>
V
Varuna Jayasiri 已提交
71 72 73 74 75 76 77 78
        </div>
    </div>
    <div class='section' id='section-0'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-0'>#</a>
            </div>
            <h1>Pay Attention to MLPs (gMLP)</h1>
V
Varuna Jayasiri 已提交
79
<p>This is a <a href="https://pytorch.org">PyTorch</a> implementation of the paper <a href="https://papers.labml.ai/paper/2105.08050">Pay Attention to MLPs</a>.</p>
V
html  
Varuna Jayasiri 已提交
80
<p>This paper introduces a Multilayer Perceptron (MLP) based architecture with gating, which they name <strong>gMLP</strong>. It consists of a stack of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathnormal">L</span></span></span></span></span> <em>gMLP</em> blocks.</p>
V
Varuna Jayasiri 已提交
81
<p>Here is <a href="experiment.html">the training code</a> for a gMLP-based autoregressive model.</p>
V
Varuna Jayasiri 已提交
82 83
<p><a href="https://app.labml.ai/run/01bd941ac74c11eb890c1d9196651a4a"><img alt="View Run" src="https://img.shields.io/badge/labml-experiment-brightgreen"></a></p>

V
Varuna Jayasiri 已提交
84 85 86 87 88 89 90 91 92 93 94 95 96 97
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">21</span><span></span><span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Optional</span>
<span class="lineno">22</span>
<span class="lineno">23</span><span class="kn">import</span> <span class="nn">torch</span>
<span class="lineno">24</span><span class="kn">from</span> <span class="nn">torch</span> <span class="kn">import</span> <span class="n">nn</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-1'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-1'>#</a>
            </div>
            <h2>gMLP Block</h2>
V
html  
Varuna Jayasiri 已提交
98
<p>Each block does the following transformations to input embeddings <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.72243em;vertical-align:-0.0391em;"></span><span class="mord coloredeq eqx" style=""><span class="mord mathnormal" style="margin-right:0.07847em">X</span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">∈</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:0.8491079999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathbb">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8491079999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight coloredeq eqbc" style=""><span class="mord mathnormal mtight" style="">n</span></span><span class="mbin mtight">×</span><span class="mord mtight coloredeq eqba" style=""><span class="mord mathnormal mtight" style="">d</span></span></span></span></span></span></span></span></span></span></span></span></span></span> where <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqbc" style=""><span class="mord mathnormal" style="">n</span></span></span></span></span></span> is the sequence length and <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqba" style=""><span class="mord mathnormal" style="">d</span></span></span></span></span></span> is the dimensionality of the embeddings:</p>
V
Varuna Jayasiri 已提交
99
<span ><span class="katex-display"><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:4.66038em;vertical-align:-2.08019em;"></span><span class="mord"><span class="mtable"><span class="col-align-r"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:2.58019em;"><span style="top:-4.74019em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span><span style="top:-3.16em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9201899999999998em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span><span style="top:-3.6023300000000003em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.16666em;"><span class="mord">~</span></span></span></span></span></span></span></span></span><span style="top:-1.5798100000000002em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.22222em;">Y</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:2.08019em;"><span></span></span></span></span></span><span class="col-align-l"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:2.58019em;"><span style="top:-4.74019em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span 
class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="margin-right:0.03588em">σ</span></span><span class="mopen">(</span><span class="mord coloredeq eqx" style=""><span class="mord mathnormal" style="margin-right:0.07847em">X</span></span><span class="mord coloredeq equ" style=""><span class="mord mathnormal" style="margin-right:0.10903em">U</span></span><span class="mclose">)</span></span></span><span style="top:-3.16em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span><span class="mclose">)</span></span></span><span style="top:-1.5798100000000002em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord"></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9201899999999998em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span><span style="top:-3.6023300000000003em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.16666em;"><span class="mord">~</span></span></span></span></span></span></span><span class="mord coloredeq eqv" style=""><span class="mord mathnormal" style="margin-right:0.22222em">V</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" 
style="height:2.08019em;"><span></span></span></span></span></span></span></span></span></span></span></span></span></span><p>where <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqv" style=""><span class="mord mathnormal" style="margin-right:0.22222em">V</span></span></span></span></span></span> and <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq equ" style=""><span class="mord mathnormal" style="margin-right:0.10903em">U</span></span></span></span></span></span> are learnable projection weights. <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eql" style=""><span class="mord mathnormal" style="">s</span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> is the Spatial Gating Unit defined below. Output dimensionality of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eql" style=""><span class="mord mathnormal" style="">s</span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> will be half of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span>. 
<span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="margin-right:0.03588em">σ</span></span></span></span></span></span> is an activation function such as <a href="https://pytorch.org/docs/stable/generated/torch.nn.GELU.html">GeLU</a>.</p>
V
Varuna Jayasiri 已提交
100

V
Varuna Jayasiri 已提交
101 102 103 104 105 106 107 108 109 110
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">27</span><span class="k">class</span> <span class="nc">GMLPBlock</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-2'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-2'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
111 112 113 114 115 116
            <ul><li><code  class="highlight"><span></span><span class="n">d_model</span></code>
 is the dimensionality (<span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqba" style=""><span class="mord mathnormal" style="">d</span></span></span></span></span></span>) of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqx" style=""><span class="mord mathnormal" style="margin-right:0.07847em">X</span></span></span></span></span></span> </li>
<li><code  class="highlight"><span></span><span class="n">d_ffn</span></code>
 is the dimensionality of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span> </li>
<li><code  class="highlight"><span></span><span class="n">seq_len</span></code>
 is the length of the token sequence (<span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqbc" style=""><span class="mord mathnormal" style="">n</span></span></span></span></span></span>)</li></ul>
V
Varuna Jayasiri 已提交
117

V
Varuna Jayasiri 已提交
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">48</span>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">d_model</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">d_ffn</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-3'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-3'>#</a>
            </div>
            
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">54</span>        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-4'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-4'>#</a>
            </div>
V
Varuna Jayasiri 已提交
139 140
            <p>Normalization layer for Pre-Norm </p>

V
Varuna Jayasiri 已提交
141 142 143 144 145 146 147 148 149 150
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">56</span>        <span class="bp">self</span><span class="o">.</span><span class="n">norm</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">LayerNorm</span><span class="p">([</span><span class="n">d_model</span><span class="p">])</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-5'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-5'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
151
            <p>Activation function <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.43056em;vertical-align:0em;"></span><span class="mord coloredeq eqn" style=""><span class="mord mathnormal" style="margin-right:0.03588em">σ</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
152

V
Varuna Jayasiri 已提交
153 154 155 156 157 158 159 160 161 162
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">58</span>        <span class="bp">self</span><span class="o">.</span><span class="n">activation</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">GELU</span><span class="p">()</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-6'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-6'>#</a>
            </div>
V
Varuna Jayasiri 已提交
163
            <p>Projection layer for <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel" style="">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqn" style="margin-right:0.03588em">σ</span></span><span class="mopen" style="">(</span><span class="mord" style=""><span class="mord mathnormal coloredeq eqx" style="margin-right:0.07847em">X</span></span><span class="mord" style=""><span class="mord mathnormal coloredeq equ" style="margin-right:0.10903em">U</span></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
164

V
Varuna Jayasiri 已提交
165 166 167 168 169 170 171 172 173 174
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">60</span>        <span class="bp">self</span><span class="o">.</span><span class="n">proj1</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">d_model</span><span class="p">,</span> <span class="n">d_ffn</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-7'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-7'>#</a>
            </div>
V
Varuna Jayasiri 已提交
175
            <p>Spatial Gating Unit <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eql" style=""><span class="mord mathnormal" style="">s</span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
176

V
Varuna Jayasiri 已提交
177 178 179 180 181 182 183 184 185 186
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">62</span>        <span class="bp">self</span><span class="o">.</span><span class="n">sgu</span> <span class="o">=</span> <span class="n">SpacialGatingUnit</span><span class="p">(</span><span class="n">d_ffn</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-8'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-8'>#</a>
            </div>
V
Varuna Jayasiri 已提交
187
            <p>Projection layer for <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.9201899999999998em;vertical-align:0em;"></span><span class="mord coloredeq eqh" style=""><span class="mord mathnormal" style="margin-right:0.22222em">Y</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel" style="">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord accent" style=""><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9201899999999998em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span></span><span style="top:-3.6023300000000003em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.16666em;"><span class="mord" style="">~</span></span></span></span></span></span></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqv" style="margin-right:0.22222em">V</span></span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
188

V
Varuna Jayasiri 已提交
189 190 191 192 193 194 195 196 197 198
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">64</span>        <span class="bp">self</span><span class="o">.</span><span class="n">proj2</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Linear</span><span class="p">(</span><span class="n">d_ffn</span> <span class="o">//</span> <span class="mi">2</span><span class="p">,</span> <span class="n">d_model</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-9'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-9'>#</a>
            </div>
V
Varuna Jayasiri 已提交
199 200
            <p>Embedding size (required by <a href="../models.html#Encoder">Encoder</a>). We use the encoder module from the transformer architecture and plug in the <em>gMLP</em> block as a replacement for the <a href="../models.html#Encoder">Transformer Layer</a>. </p>

V
Varuna Jayasiri 已提交
201 202 203 204 205 206 207 208 209 210
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">68</span>        <span class="bp">self</span><span class="o">.</span><span class="n">size</span> <span class="o">=</span> <span class="n">d_model</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-10'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-10'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
211 212
            <ul><li><code  class="highlight"><span></span><span class="n">x</span></code>
 is the input embedding tensor <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqx" style=""><span class="mord mathnormal" style="margin-right:0.07847em">X</span></span></span></span></span></span> of shape <code  class="highlight"><span></span><span class="p">[</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">,</span> <span class="n">d_model</span><span class="p">]</span></code>
V
Varuna Jayasiri 已提交
213
 </li>
V
html  
Varuna Jayasiri 已提交
214 215
<li><code  class="highlight"><span></span><span class="n">mask</span></code>
 is a boolean mask of shape <code  class="highlight"><span></span><span class="p">[</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="mi">1</span><span class="p">]</span></code>
V
Varuna Jayasiri 已提交
216 217
 that controls the visibility of tokens  among each other.</li></ul>

V
Varuna Jayasiri 已提交
218 219 220 221 222 223 224 225 226 227
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">70</span>    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-11'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-11'>#</a>
            </div>
V
Varuna Jayasiri 已提交
228 229
            <p>Keep a copy for shortcut connection </p>

V
Varuna Jayasiri 已提交
230 231 232 233 234 235 236 237 238 239
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">77</span>        <span class="n">shortcut</span> <span class="o">=</span> <span class="n">x</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-12'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-12'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
240
            <p>Normalize <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqx" style=""><span class="mord mathnormal" style="margin-right:0.07847em">X</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
241

V
Varuna Jayasiri 已提交
242 243 244 245 246 247 248 249 250 251
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">79</span>        <span class="n">x</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="n">x</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-13'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-13'>#</a>
            </div>
V
Varuna Jayasiri 已提交
252
            <p>Projection and activation <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eqi" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel" style="">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqn" style="margin-right:0.03588em">σ</span></span><span class="mopen" style="">(</span><span class="mord" style=""><span class="mord mathnormal coloredeq eqx" style="margin-right:0.07847em">X</span></span><span class="mord" style=""><span class="mord mathnormal coloredeq equ" style="margin-right:0.10903em">U</span></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
253

V
Varuna Jayasiri 已提交
254 255 256 257 258 259 260 261 262 263
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">81</span>        <span class="n">z</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">activation</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">proj1</span><span class="p">(</span><span class="n">x</span><span class="p">))</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-14'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-14'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
264
            <p>Spatial Gating Unit <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.9201899999999998em;vertical-align:0em;"></span><span class="mord accent"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9201899999999998em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span><span style="top:-3.6023300000000003em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.16666em;"><span class="mord">~</span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span><span class="mclose">)</span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
265

V
Varuna Jayasiri 已提交
266 267 268 269 270 271 272 273 274 275
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">83</span>        <span class="n">z</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">sgu</span><span class="p">(</span><span class="n">z</span><span class="p">,</span> <span class="n">mask</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-15'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-15'>#</a>
            </div>
V
Varuna Jayasiri 已提交
276
            <p>Final projection <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.9201899999999998em;vertical-align:0em;"></span><span class="mord coloredeq eqh" style=""><span class="mord mathnormal" style="margin-right:0.22222em">Y</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel" style="">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mord accent" style=""><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.9201899999999998em;"><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span></span><span style="top:-3.6023300000000003em;"><span class="pstrut" style="height:3em;"></span><span class="accent-body" style="left:-0.16666em;"><span class="mord" style="">~</span></span></span></span></span></span></span><span class="mord" style=""><span class="mord mathnormal coloredeq eqv" style="margin-right:0.22222em">V</span></span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
277

V
Varuna Jayasiri 已提交
278 279 280 281 282 283 284 285 286 287
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">85</span>        <span class="n">z</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">proj2</span><span class="p">(</span><span class="n">z</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-16'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-16'>#</a>
            </div>
V
Varuna Jayasiri 已提交
288 289
            <p>Add the shortcut connection </p>

V
Varuna Jayasiri 已提交
290 291 292 293 294 295 296 297 298 299 300
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">88</span>        <span class="k">return</span> <span class="n">z</span> <span class="o">+</span> <span class="n">shortcut</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-17'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-17'>#</a>
            </div>
            <h2>Spatial Gating Unit</h2>
V
Varuna Jayasiri 已提交
301 302
<p><span ><span class="katex-display"><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal">s</span><span class="mopen">(</span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqe" style=""><span class="mord" style=""><span class="mord coloredeq eqp" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight coloredeq eqs" style="">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mord" style=""><span class="mord coloredeq eqo" style="">⊙</span></span><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span 
class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mclose" style="">)</span></span></span></span></span></span></span></p>
<p>where <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight coloredeq eqw" style=""><span class="mord mathnormal mtight" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight">,</span><span class="mord mtight coloredeq eqz" style=""><span class="mord mathnormal mtight" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:0.76666em;vertical-align:-0.08333em;"></span><span class="mord coloredeq eqw" style=""><span class="mord mathnormal" style="margin-right:0.13889em">W</span></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" 
style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqz" style=""><span class="mord mathnormal" style="">b</span></span></span></span></span></span> is a linear transformation along the sequence dimension, and <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.66666em;vertical-align:-0.08333em;"></span><span class="mord coloredeq eqo" style=""><span class="mord" style="">⊙</span></span></span></span></span></span> is element-wise multiplication. <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span> is split into two parts of equal size <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqp" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight coloredeq eqs" style="">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> and <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" 
style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> along the channel dimension (embedding dimension).</p>
V
Varuna Jayasiri 已提交
303

V
Varuna Jayasiri 已提交
304 305 306 307 308 309 310 311 312 313
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">91</span><span class="k">class</span> <span class="nc">SpacialGatingUnit</span><span class="p">(</span><span class="n">nn</span><span class="o">.</span><span class="n">Module</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-18'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-18'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
314 315 316
            <ul><li><code  class="highlight"><span></span><span class="n">d_z</span></code>
 is the dimensionality of <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span> </li>
<li><code  class="highlight"><span></span><span class="n">seq_len</span></code>
V
Varuna Jayasiri 已提交
317 318
 is the sequence length</li></ul>

V
Varuna Jayasiri 已提交
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">101</span>    <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">d_z</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-19'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-19'>#</a>
            </div>
            
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">106</span>        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">()</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-20'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-20'>#</a>
            </div>
V
Varuna Jayasiri 已提交
340
            <p>Normalization layer before applying <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqk" style=""><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
341

V
Varuna Jayasiri 已提交
342 343 344 345 346 347 348 349 350 351
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">108</span>        <span class="bp">self</span><span class="o">.</span><span class="n">norm</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">LayerNorm</span><span class="p">([</span><span class="n">d_z</span> <span class="o">//</span> <span class="mi">2</span><span class="p">])</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-21'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-21'>#</a>
            </div>
V
Varuna Jayasiri 已提交
352 353
            <p>Weight <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqw" style=""><span class="mord mathnormal" style="margin-right:0.13889em">W</span></span></span></span></span></span> in <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqk" style=""><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span>.</p>
<p>The paper notes that it&#x27;s important to initialize weights to small values and the bias to <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord coloredeq eqs" style=""><span class="mord" style="">1</span></span></span></span></span></span>, so that during the initial training <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord coloredeq eql" style=""><span class="mord mathnormal" style="">s</span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> is close to identity (apart from the split). </p>
V
Varuna Jayasiri 已提交
354

V
Varuna Jayasiri 已提交
355 356 357 358 359 360 361 362 363 364
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">113</span>        <span class="bp">self</span><span class="o">.</span><span class="n">weight</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">)</span><span class="o">.</span><span class="n">uniform_</span><span class="p">(</span><span class="o">-</span><span class="mf">0.01</span><span class="p">,</span> <span class="mf">0.01</span><span class="p">),</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-22'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-22'>#</a>
            </div>
V
Varuna Jayasiri 已提交
365
            <p>Bias <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqz" style=""><span class="mord mathnormal" style="">b</span></span></span></span></span></span> in <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqk" style=""><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span></p>
V
html  
Varuna Jayasiri 已提交
366
<p>The paper notes that it&#x27;s important to initialize bias to <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord coloredeq eqs" style=""><span class="mord" style="">1</span></span></span></span></span></span>. </p>
V
Varuna Jayasiri 已提交
367

V
Varuna Jayasiri 已提交
368 369 370 371 372 373 374 375 376 377
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">117</span>        <span class="bp">self</span><span class="o">.</span><span class="n">bias</span> <span class="o">=</span> <span class="n">nn</span><span class="o">.</span><span class="n">Parameter</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="n">seq_len</span><span class="p">),</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-23'>
        <div class='docs doc-strings'>
            <div class='section-link'>
                <a href='#section-23'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
378 379
            <ul><li><code  class="highlight"><span></span><span class="n">z</span></code>
 is the input <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span> of shape <code  class="highlight"><span></span><span class="p">[</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">,</span> <span class="n">d_z</span><span class="p">]</span></code>
V
Varuna Jayasiri 已提交
380
 </li>
V
html  
Varuna Jayasiri 已提交
381 382 383
<li><code  class="highlight"><span></span><span class="n">mask</span></code>
 is a boolean mask of shape <code  class="highlight"><span></span><span class="p">[</span><span class="n">seq_len</span><span class="p">,</span> <span class="n">seq_len</span><span class="p">,</span> <span class="mi">1</span><span class="p">]</span></code>
 that controls the visibility of tokens  among each other. The last dimension of size <code  class="highlight"><span></span><span class="mi">1</span></code>
V
Varuna Jayasiri 已提交
384 385
 is the batch dimension, which exists in other transformer implementations and is kept here for compatibility.</li></ul>

V
Varuna Jayasiri 已提交
386 387 388 389 390 391 392 393 394 395
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">119</span>    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">z</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">):</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-24'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-24'>#</a>
            </div>
V
Varuna Jayasiri 已提交
396 397
            <p>Get sequence length </p>

V
Varuna Jayasiri 已提交
398 399 400 401 402 403 404 405 406 407
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">128</span>        <span class="n">seq_len</span> <span class="o">=</span> <span class="n">z</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-25'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-25'>#</a>
            </div>
V
Varuna Jayasiri 已提交
408
            <p>Split <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord coloredeq eqy" style=""><span class="mord mathnormal" style="margin-right:0.07153em">Z</span></span></span></span></span></span> into <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqp" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight coloredeq eqs" style="">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> and <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" 
style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> </p>
V
Varuna Jayasiri 已提交
409

V
Varuna Jayasiri 已提交
410 411 412 413 414 415 416 417 418 419
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">130</span>        <span class="n">z1</span><span class="p">,</span> <span class="n">z2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">chunk</span><span class="p">(</span><span class="n">z</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">dim</span><span class="o">=-</span><span class="mi">1</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-26'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-26'>#</a>
            </div>
V
Varuna Jayasiri 已提交
420 421
            <p>Check mask </p>

V
Varuna Jayasiri 已提交
422 423 424 425 426 427 428 429 430 431
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">133</span>        <span class="k">if</span> <span class="n">mask</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-27'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-27'>#</a>
            </div>
V
html  
Varuna Jayasiri 已提交
432 433 434
            <p><code  class="highlight"><span></span><span class="n">mask</span></code>
 has shape <code  class="highlight"><span></span><span class="p">[</span><span class="n">seq_len_q</span><span class="p">,</span> <span class="n">seq_len_k</span><span class="p">,</span> <span class="n">batch_size</span><span class="p">]</span></code>
. The batch dimension should be of size <code  class="highlight"><span></span><span class="mi">1</span></code>
V
Varuna Jayasiri 已提交
435 436
 because this implementation supports only the same mask for all samples in the batch. </p>

V
Varuna Jayasiri 已提交
437 438 439 440 441 442 443 444 445 446 447
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">137</span>            <span class="k">assert</span> <span class="n">mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">or</span> <span class="n">mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">==</span> <span class="n">seq_len</span>
<span class="lineno">138</span>            <span class="k">assert</span> <span class="n">mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="n">seq_len</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-28'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-28'>#</a>
            </div>
V
Varuna Jayasiri 已提交
448 449
            <p>Here we only support the same mask for all samples </p>

V
Varuna Jayasiri 已提交
450 451 452 453 454 455 456 457 458 459
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">140</span>            <span class="k">assert</span> <span class="n">mask</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-29'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-29'>#</a>
            </div>
V
Varuna Jayasiri 已提交
460 461
            <p>Remove the batch dimension </p>

V
Varuna Jayasiri 已提交
462 463 464 465 466 467 468 469 470 471
        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">142</span>            <span class="n">mask</span> <span class="o">=</span> <span class="n">mask</span><span class="p">[:,</span> <span class="p">:,</span> <span class="mi">0</span><span class="p">]</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-30'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-30'>#</a>
            </div>
            <p>Normalize <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span></span> before <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqk" style=""><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" 
style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>

        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">145</span>        <span class="n">z2</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">norm</span><span class="p">(</span><span class="n">z2</span><span class="p">)</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-31'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-31'>#</a>
            </div>
            <p>Get the weight matrix; truncate if larger than <code  class="highlight"><span></span><span class="n">seq_len</span></code>
 </p>

        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">147</span>        <span class="n">weight</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">weight</span><span class="p">[:</span><span class="n">seq_len</span><span class="p">,</span> <span class="p">:</span><span class="n">seq_len</span><span class="p">]</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-32'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-32'>#</a>
            </div>
            <p>Apply mask to the weights.</p>
<p>If <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.969438em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord coloredeq eqw" style=""><span class="mord mathnormal" style="margin-right:0.13889em">W</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathnormal mtight">i</span><span class="mpunct mtight">,</span><span class="mord mtight coloredeq eqbb" style=""><span class="mord mathnormal mtight" style="margin-right:0.05724em">j</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span></span></span></span></span> is <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord">0</span></span></span></span></span> then <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight coloredeq eqw" style=""><span class="mord mathnormal mtight" 
style="margin-right:0.13889em">W</span></span><span class="mpunct mtight">,</span><span class="mord mtight coloredeq eqz" style=""><span class="mord mathnormal mtight" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mclose"><span class="mclose">)</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathnormal mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span></span> will not get any information from token <span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:0.85396em;vertical-align:-0.19444em;"></span><span class="mord coloredeq eqbb" style=""><span class="mord mathnormal" 
style="margin-right:0.05724em">j</span></span></span></span></span></span>. </p>

        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">152</span>        <span class="k">if</span> <span class="n">mask</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="lineno">153</span>            <span class="n">weight</span> <span class="o">=</span> <span class="n">weight</span> <span class="o">*</span> <span class="n">mask</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-33'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-33'>#</a>
            </div>
            <p><span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight coloredeq eqw" style=""><span class="mord mathnormal mtight" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight">,</span><span class="mord mtight coloredeq eqz" style=""><span class="mord mathnormal mtight" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" 
style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:0.83333em;vertical-align:-0.15em;"></span><span class="mord coloredeq eqw" style=""><span class="mord mathnormal" style="margin-right:0.13889em">W</span></span><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord coloredeq eqz" style=""><span class="mord mathnormal" style="">b</span></span></span></span></span></span> </p>

        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">156</span>        <span class="n">z2</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">einsum</span><span class="p">(</span><span class="s1">&#39;ij,jbd-&gt;ibd&#39;</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">z2</span><span class="p">)</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">bias</span><span class="p">[:</span><span class="n">seq_len</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">]</span></pre></div>
        </div>
    </div>
    <div class='section' id='section-34'>
        <div class='docs'>
            <div class='section-link'>
                <a href='#section-34'>#</a>
            </div>
            <p><span ><span class="katex"><span aria-hidden="true" class="katex-html"><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord coloredeq eqe" style=""><span class="mord" style=""><span class="mord coloredeq eqp" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight coloredeq eqs" style="">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mord" style=""><span class="mord coloredeq eqo" style=""></span></span><span class="mord" style=""><span class="mord mathnormal" style="margin-right:0.10764em">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style=""><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqw" style="margin-right:0.13889em">W</span></span><span class="mpunct mtight" style="">,</span><span class="mord mtight" style=""><span class="mord mathnormal mtight coloredeq eqz" style="">b</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" 
style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mopen" style="">(</span><span class="mord" style=""><span class="mord coloredeq eqq" style=""><span class="mord" style=""><span class="mord mathnormal coloredeq eqy" style="margin-right:0.07153em">Z</span></span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight" style=""><span class="mord mtight" style="">2</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span><span class="mclose" style="">)</span></span></span></span></span></span> </p>

        </div>
        <div class='code'>
            <div class="highlight"><pre><span class="lineno">159</span>        <span class="k">return</span> <span class="n">z1</span> <span class="o">*</span> <span class="n">z2</span></pre></div>
        </div>
    </div>
    <div class='footer'>
        <a href="https://papers.labml.ai">Trending Research Papers</a>
        <a href="https://labml.ai">labml.ai</a>
    </div>
</div>
<script src="../../interactive.js?v=1"></script>
<script>
    // Pass every image that is a direct child of a paragraph to handleImage,
    // in document order.
    function handleImages() {
        var paragraphImages = document.querySelectorAll('p>img')
        var index = 0

        while (index < paragraphImages.length) {
            handleImage(paragraphImages[index])
            index += 1
        }
    }

    /**
     * Centre an inline image and let the reader enlarge it in a modal overlay.
     *
     * A `#modal` element (content wrapper, enlarged image, close control) is
     * built up front but only attached to `document.body` when the image is
     * clicked; clicking the close control detaches it again.
     *
     * @param {HTMLImageElement} img - the image to make expandable; its parent
     *     element gets `text-align: center`.
     */
    function handleImage(img) {
        img.parentElement.style.textAlign = 'center'

        var modal = document.createElement('div')
        modal.id = 'modal'

        var modalContent = document.createElement('div')
        modal.appendChild(modalContent)

        var modalImage = document.createElement('img')
        modalContent.appendChild(modalImage)

        var span = document.createElement('span')
        span.classList.add('close')
        span.textContent = 'x'
        modal.appendChild(span)

        // addEventListener instead of `onclick = ...` so a click handler set
        // elsewhere on the same element is not overwritten.
        img.addEventListener('click', function () {
            // Read src/alt at click time so the enlarged copy reflects the
            // image as it is now, and always carries a text alternative.
            modalImage.src = img.src
            modalImage.alt = img.alt || ''
            document.body.appendChild(modal)
        })

        span.addEventListener('click', function () {
            // removeChild throws if the node is not a child of the target, so
            // only detach while the modal is actually attached.
            if (modal.parentNode) {
                modal.parentNode.removeChild(modal)
            }
        })
    }

    // Run once, immediately: this script sits at the end of <body>, after the
    // page content that handleImages queries.
    handleImages()
</script>
</body>
</html>