<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>Design Doc: Parallel_Do in PaddlePaddle &mdash; PaddlePaddle  documentation</title>
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../_static/css/theme.css" type="text/css" />
  

  
31

32 33 34 35 36 37 38 39 40 41 42 43 44
  
        <link rel="index" title="Index"
              href="../genindex.html"/>
        <link rel="search" title="Search" href="../search.html"/>
    <link rel="top" title="PaddlePaddle  documentation" href="../index.html"/> 

  
  <script src="../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

45 46 47 48 49 50 51 52 53 54 55 56 57
  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          

          
            <a href="../index_en.html" class="icon icon-home"> PaddlePaddle
          

          
58 59
          </a>

60 61 62 63 64 65
          
            
            
          

          
66 67 68 69 70 71
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
72
</div>
73 74

          
75 76 77 78 79 80 81 82 83 84 85 86
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
                <ul>
<li class="toctree-l1"><a class="reference internal" href="../getstarted/index_en.html">GET STARTED</a></li>
<li class="toctree-l1"><a class="reference internal" href="../build_and_install/index_en.html">Install and Build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../howto/index_en.html">HOW TO</a></li>
<li class="toctree-l1"><a class="reference internal" href="../dev/index_en.html">Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../faq/index_en.html">FAQ</a></li>
87 88
</ul>

89 90 91 92
            
          
        </div>
      </div>
93 94
    </nav>

95
    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
96

97 98 99 100 101
      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
        <a href="../index_en.html">PaddlePaddle</a>
      </nav>
102 103


104 105 106 107
      
      <div class="wy-nav-content">
        <div class="rst-content">
          
108

109
 
110 111 112 113 114



<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
115
    <li><a href="../index_en.html">Docs</a> &raquo;</li>
116 117
      
    <li>Design Doc: Parallel_Do in PaddlePaddle</li>
118 119 120 121 122 123 124
      <li class="wy-breadcrumbs-aside">
        
          
            <a href="../_sources/design/parallel_do.md.txt" rel="nofollow"> View page source</a>
          
        
      </li>
125
  </ul>
126
  <hr/>
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="design-doc-parallel-do-in-paddlepaddle">
<span id="design-doc-parallel-do-in-paddlepaddle"></span><h1>Design Doc: Parallel_Do in PaddlePaddle<a class="headerlink" href="#design-doc-parallel-do-in-paddlepaddle" title="Permalink to this headline"></a></h1>
<p>In PaddlePaddle, we use the parallel_do primitive to represent multithreaded data-parallel processing.</p>
<div class="section" id="design-overview">
<span id="design-overview"></span><h2>Design overview<a class="headerlink" href="#design-overview" title="Permalink to this headline"></a></h2>
<p>The definition of a parallel_do op looks like the following</p>
<div class="highlight-c++"><div class="highlight"><pre><span></span><span class="n">AddInput</span><span class="p">(</span><span class="n">kInputs</span><span class="p">,</span> <span class="s">&quot;Inputs needed to be split onto different devices&quot;</span><span class="p">).</span><span class="n">AsDuplicable</span><span class="p">();</span>
<span class="n">AddInput</span><span class="p">(</span><span class="n">kParameters</span><span class="p">,</span> <span class="s">&quot;Parameters are duplicated over different devices&quot;</span><span class="p">)</span>
    <span class="p">.</span><span class="n">AsDuplicable</span><span class="p">();</span>
<span class="n">AddInput</span><span class="p">(</span><span class="n">kPlaces</span><span class="p">,</span> <span class="s">&quot;Devices used for parallel processing&quot;</span><span class="p">);</span>
<span class="n">AddOutput</span><span class="p">(</span><span class="n">kOutputs</span><span class="p">,</span> <span class="s">&quot;Outputs needed to be merged from different devices&quot;</span><span class="p">).</span><span class="n">AsDuplicable</span><span class="p">();</span>
<span class="n">AddOutput</span><span class="p">(</span><span class="n">kParallelScopes</span><span class="p">,</span>
          <span class="s">&quot;Scopes for all local variables in forward pass. One scope for each device&quot;</span><span class="p">);</span>
<span class="n">AddAttr</span><span class="o">&lt;</span><span class="n">framework</span><span class="o">::</span><span class="n">BlockDesc</span> <span class="o">*&gt;</span><span class="p">(</span><span class="n">kParallelBlock</span><span class="p">,</span>
                                <span class="s">&quot;List of operaters to be executed in parallel&quot;</span><span class="p">);</span>
</pre></div>
</div>
<p>A vanilla implementation of parallel_do can be shown as the following (<code class="docutils literal"><span class="pre">|</span></code> means single thread and
<code class="docutils literal"><span class="pre">||||</span></code> means multiple threads)</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">In</span> <span class="n">the</span> <span class="n">forward</span> <span class="k">pass</span>
  <span class="o">|</span>      <span class="n">Split</span> <span class="nb">input</span> <span class="n">onto</span> <span class="n">different</span> <span class="n">devices</span>
  <span class="o">|</span>      <span class="n">Copy</span> <span class="n">parameter</span> <span class="n">onto</span> <span class="n">different</span> <span class="n">devices</span>
  <span class="o">||||</span>   <span class="n">Compute</span> <span class="n">forward</span> <span class="k">pass</span> <span class="ow">in</span> <span class="n">parallel</span>
  <span class="o">|</span>      <span class="n">Merge</span> <span class="n">output</span> <span class="kn">from</span> <span class="nn">different</span> <span class="n">devices</span>

<span class="n">In</span> <span class="n">the</span> <span class="n">backward</span> <span class="k">pass</span>
  <span class="o">|</span>      <span class="n">Split</span> <span class="n">output</span><span class="nd">@grad</span> <span class="n">onto</span> <span class="n">different</span> <span class="n">devices</span>
  <span class="o">||||</span>   <span class="n">Compute</span> <span class="n">backward</span> <span class="k">pass</span> <span class="ow">in</span> <span class="n">parallel</span>
  <span class="o">|</span>      <span class="n">accumulate</span> <span class="n">param</span><span class="nd">@grad</span> <span class="kn">from</span> <span class="nn">different</span> <span class="n">devices</span> <span class="n">to</span> <span class="n">the</span> <span class="n">first</span> <span class="n">device</span>
  <span class="o">|</span>      <span class="n">Merge</span> <span class="nb">input</span><span class="nd">@grad</span> <span class="kn">from</span> <span class="nn">different</span> <span class="n">devices</span>
  <span class="o">|</span>      <span class="n">Copy</span> <span class="n">param</span><span class="nd">@grad</span> <span class="n">to</span> <span class="n">the</span> <span class="n">place</span> <span class="n">of</span> <span class="n">parallel_do_op</span>
</pre></div>
</div>
<p>This implementation allows us to write a mixed-device program like this</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="n">W1</span> <span class="o">=</span> <span class="n">fluid</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">[</span><span class="mi">100</span><span class="p">,</span><span class="mi">20</span><span class="p">],</span> <span class="n">parameter</span><span class="o">=</span><span class="n">true</span><span class="p">)</span>
<span class="n">W2</span> <span class="o">=</span> <span class="n">fluid</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">[</span><span class="mi">20</span><span class="p">,</span><span class="mi">15</span><span class="p">],</span> <span class="n">parameter</span><span class="o">=</span><span class="n">true</span><span class="p">)</span>

<span class="n">data</span> <span class="o">=</span> <span class="n">layers</span><span class="o">.</span><span class="n">data</span><span class="p">()</span>

<span class="n">gpu_places</span> <span class="o">=</span> <span class="n">layers</span><span class="o">.</span><span class="n">get_place</span><span class="p">(</span><span class="n">use_gpu</span><span class="o">=</span><span class="bp">True</span><span class="p">)</span>
<span class="c1"># parallel processing on multiple GPUs</span>
<span class="n">pd</span> <span class="o">=</span> <span class="n">ParallelDo</span><span class="p">(</span><span class="n">gpu_places</span><span class="p">)</span>
<span class="k">with</span> <span class="n">pd</span><span class="o">.</span><span class="n">do</span><span class="p">(</span><span class="nb">input</span><span class="o">=</span><span class="n">data</span><span class="p">):</span>
    <span class="n">prediction</span> <span class="o">=</span> <span class="n">softmax</span><span class="p">(</span><span class="n">fc</span><span class="p">(</span><span class="n">fc</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">W1</span><span class="p">),</span> <span class="n">W2</span><span class="p">))</span>
    <span class="n">write_output</span><span class="p">(</span><span class="n">prediction</span><span class="p">)</span>
<span class="n">prediction</span> <span class="o">=</span> <span class="n">pd</span><span class="p">()</span>
<span class="n">loss</span> <span class="o">=</span> <span class="n">cross_entropy</span><span class="p">(</span><span class="n">prediction</span><span class="p">,</span> <span class="n">label</span><span class="p">)</span>
</pre></div>
</div>
<p>And the ProgramDesc looks like the following</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="c1"># start_program will be run by executor(CPUPlace), all w1, w2 will be allocated on CPU</span>
<span class="n">start_program</span>
<span class="p">{</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">w1</span><span class="p">,</span> <span class="n">w2</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">init</span><span class="p">(</span><span class="n">w1</span><span class="p">),</span> <span class="n">init</span><span class="p">(</span><span class="n">w2</span><span class="p">)</span>
<span class="p">}</span>

<span class="n">main_program</span>
<span class="p">{</span>
<span class="n">block0</span> <span class="p">{</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">places</span><span class="p">,</span> <span class="n">w1</span><span class="p">,</span> <span class="n">w2</span><span class="p">,</span> <span class="n">w1_grad</span><span class="p">,</span> <span class="n">w2_grad</span><span class="p">,</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">get_place</span><span class="p">,</span> <span class="n">parallel_do</span><span class="p">(</span><span class="n">block1</span><span class="p">),</span>
       <span class="n">parallel_do_grad</span><span class="p">(</span><span class="n">block2</span><span class="p">),</span>
       <span class="n">sgd</span><span class="p">(</span><span class="n">w2</span><span class="p">,</span> <span class="n">w2_grad</span><span class="p">),</span>
       <span class="n">sgd</span><span class="p">(</span><span class="n">w1</span><span class="p">,</span> <span class="n">w1_grad</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">block1</span> <span class="p">{</span> <span class="c1"># the forward pass</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">0</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span><span class="p">,</span> <span class="n">loss</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">fc</span><span class="p">,</span> <span class="n">fc</span><span class="p">,</span> <span class="n">softmax</span>
<span class="p">}</span>
<span class="n">block2</span> <span class="p">{</span> <span class="c1"># the backward pass</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">1</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data_grad</span><span class="p">,</span> <span class="n">h1_grad</span><span class="p">,</span> <span class="n">h2_grad</span><span class="p">,</span> <span class="n">loss_grad</span><span class="p">,</span> <span class="n">local_w1_grad</span><span class="p">,</span> <span class="n">local_w2_grad</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">softmax_grad</span><span class="p">,</span>
       <span class="n">fc_grad</span>
       <span class="n">fc_grad</span>
<span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
</div>
<div class="section" id="performance-imporvement">
<span id="performance-imporvement"></span><h2>Performance Improvement<a class="headerlink" href="#performance-imporvement" title="Permalink to this headline"></a></h2>
<p>There are several places we can make this parallel_do faster.</p>
<div class="section" id="forward-split-input-onto-different-devices">
<span id="forward-split-input-onto-different-devices"></span><h3>forward: split input onto different devices<a class="headerlink" href="#forward-split-input-onto-different-devices" title="Permalink to this headline"></a></h3>
<p>If the input of the parallel_do is independent from any prior operators, we can avoid this step by
prefetching the input onto different devices in a separate background thread. And the Python code
looks like this.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span>pd = ParallelDo(gpu_places)
with pd.do():
    feature = get_data_from_prefetch_queue(gpu_places)
    prediction = my_net(feature)
    write_output(prediction)
</pre></div>
</div>
</div>
<div class="section" id="forward-copy-parameter-to-onto-different-devices">
<span id="forward-copy-parameter-to-onto-different-devices"></span><h3>forward: Copy parameter onto different devices<a class="headerlink" href="#forward-copy-parameter-to-onto-different-devices" title="Permalink to this headline"></a></h3>
<p>We can avoid this step by making each device have a copy of the parameter. This requires:</p>
<ol class="simple">
<li><code class="docutils literal"><span class="pre">fluid.default_start_up_program()</span></code> to be run on all devices</li>
<li>In the backward, allreduce param&#64;grad at different devices, this requires<ol>
<li><code class="docutils literal"><span class="pre">backward.py</span></code> add <code class="docutils literal"><span class="pre">allreduce</span></code> operators at parallel_do_grad</li>
<li><code class="docutils literal"><span class="pre">allreduce</span></code> operators need to be called in async mode to achieve maximum throughput</li>
</ol>
</li>
<li>apply gradient-related ops (e.g. clipping, normalization, decay, sgd) on different devices in parallel</li>
</ol>
<p>By doing so, we also avoided &#8220;backward: accumulate param&#64;grad from different devices to the first device&#8221;.
And the ProgramDesc looks like the following</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="c1"># w1, w2 will be allocated on all GPUs</span>
<span class="n">start_program</span>
<span class="p">{</span>
<span class="n">block0</span> <span class="p">{</span>
  <span class="n">parallel_do</span><span class="p">(</span><span class="n">block1</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">block1</span> <span class="p">{</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">0</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">w1</span><span class="p">,</span> <span class="n">w2</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">init</span><span class="p">(</span><span class="n">w1</span><span class="p">),</span> <span class="n">init</span><span class="p">(</span><span class="n">w2</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">}</span>

<span class="n">main_program</span>
<span class="p">{</span>
<span class="n">block0</span> <span class="p">{</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">places</span><span class="p">,</span> <span class="n">w1</span><span class="p">,</span> <span class="n">w2</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">get_place</span><span class="p">,</span> <span class="n">parallel_do</span><span class="p">(</span><span class="n">block1</span><span class="p">),</span>
       <span class="n">parallel_do_grad</span><span class="p">(</span><span class="n">block2</span><span class="p">),</span>      <span class="c1"># append_backward</span>
       <span class="n">parallel_do</span><span class="p">(</span><span class="n">block3</span><span class="p">)</span>            <span class="c1"># append_optimization</span>
       
<span class="p">}</span>
<span class="n">block1</span> <span class="p">{</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">0</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data</span><span class="p">,</span> <span class="n">h1</span><span class="p">,</span> <span class="n">h2</span><span class="p">,</span> <span class="n">loss</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">fc</span><span class="p">,</span> <span class="n">fc</span><span class="p">,</span> <span class="n">softmax</span>
<span class="p">}</span>
<span class="n">block2</span> <span class="p">{</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">1</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">data_grad</span><span class="p">,</span> <span class="n">h1_grad</span><span class="p">,</span> <span class="n">h2_grad</span><span class="p">,</span> <span class="n">loss_grad</span><span class="p">,</span> <span class="n">w1_grad</span><span class="p">,</span> <span class="n">w2_grad</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">softmax_grad</span><span class="p">,</span>
       <span class="n">fc_grad</span><span class="p">,</span> <span class="n">allreduce</span><span class="p">(</span><span class="n">places</span><span class="p">,</span> <span class="n">scopes</span><span class="p">,</span> <span class="n">w1_grad</span><span class="p">),</span>
       <span class="n">fc_grad</span><span class="p">,</span> <span class="n">allreduce</span><span class="p">(</span><span class="n">places</span><span class="p">,</span> <span class="n">scopes</span><span class="p">,</span> <span class="n">w2_grad</span><span class="p">)</span>
<span class="p">}</span>
<span class="n">block3</span> <span class="p">{</span>
  <span class="n">parent_block</span><span class="p">:</span> <span class="mi">0</span>
  <span class="nb">vars</span><span class="p">:</span> <span class="n">lr</span>
  <span class="n">ops</span><span class="p">:</span> <span class="n">sgd</span><span class="p">(</span><span class="n">w2</span><span class="p">,</span> <span class="n">w2_grad</span><span class="p">),</span>
       <span class="n">sgd</span><span class="p">(</span><span class="n">w1</span><span class="p">,</span> <span class="n">w1_grad</span><span class="p">)</span>
<span class="p">}</span>
<span class="p">}</span>
</pre></div>
</div>
</div>
</div>
</div>


           </div>
          </div>
          <footer>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'../',
            VERSION:'',
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true
        };
    </script>
      <script type="text/javascript" src="../_static/jquery.js"></script>
      <script type="text/javascript" src="../_static/underscore.js"></script>
      <script type="text/javascript" src="../_static/doctools.js"></script>
      <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
332

333 334 335 336 337 338
  

  
  
    <script type="text/javascript" src="../_static/js/theme.js"></script>
  
339

340
  
341 342 343 344 345 346 347
  
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.StickyNav.enable();
      });
  </script>
   
348 349 350

</body>
</html>