parameter_server.html 10.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10


<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
11
  <title>Design Doc: Parameter Server &mdash; PaddlePaddle  文档</title>
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  

  
31

32 33 34 35 36 37 38 39 40 41 42 43 44
  
        <link rel="index" title="索引"
              href="../../genindex.html"/>
        <link rel="search" title="搜索" href="../../search.html"/>
    <link rel="top" title="PaddlePaddle  文档" href="../../index.html"/> 

  
  <script src="../../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

45 46 47 48 49 50 51 52 53 54 55 56 57
  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          

          
            <a href="../../index_cn.html" class="icon icon-home"> PaddlePaddle
          

          
58 59
          </a>

60 61 62 63 64 65
          
            
            
          

          
66 67 68 69 70 71
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
72
</div>
73 74

          
75 76 77 78 79 80 81 82 83 84 85 86
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
                <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_cn.html">新手入门</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../build_and_install/index_cn.html">安装与编译</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../howto/index_cn.html">进阶使用</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../dev/index_cn.html">开发标准</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../faq/index_cn.html">FAQ</a></li>
87 88
</ul>

89 90 91 92
            
          
        </div>
      </div>
93 94
    </nav>

95
    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
96

97 98 99 100 101
      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
        <a href="../../index_cn.html">PaddlePaddle</a>
      </nav>
102 103


104 105 106 107
      
      <div class="wy-nav-content">
        <div class="rst-content">
          
108

109
 
110 111 112 113 114



<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
115
    <li><a href="../../index_cn.html">Docs</a> &raquo;</li>
116
      
117
    <li>Design Doc: Parameter Server</li>
118 119 120 121 122 123 124
      <li class="wy-breadcrumbs-aside">
        
          
            <a href="../../_sources/design/dist_refactor/parameter_server.md.txt" rel="nofollow"> View page source</a>
          
        
      </li>
125
  </ul>
126
  <hr/>
127 128 129 130
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
131 132
  <div class="section" id="design-doc-parameter-server">
<span id="design-doc-parameter-server"></span><h1>Design Doc: Parameter Server<a class="headerlink" href="#design-doc-parameter-server" title="永久链接至标题"></a></h1>
133 134 135 136 137 138 139 140 141
<div class="section" id="abstract">
<span id="abstract"></span><h2>Abstract<a class="headerlink" href="#abstract" title="永久链接至标题"></a></h2>
<p>We propose an approach to implement the parameter server. In this
approach, there is no fundamental difference between the trainer and
the parameter server: they both run subgraphs, but subgraphs of
different purposes.</p>
</div>
<div class="section" id="background">
<span id="background"></span><h2>Background<a class="headerlink" href="#background" title="永久链接至标题"></a></h2>
142
<p>The previous implementations of the parameter server do not run a
143
fluid sub-program. Parameter initialization, optimizer computation, network
144
communication and checkpointing are implemented twice on both the
145 146 147 148 149 150
trainer and the parameter server.</p>
<p>It would be great if we could write code once and use it on both the
trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that, after the current refactoring, we are
representing everything as a computation graph on the
trainer, representing everything as a computation graph on the parameter
151 152 153 154
server becomes a natural extension.</p>
</div>
<div class="section" id="design">
<span id="design"></span><h2>Design<a class="headerlink" href="#design" title="永久链接至标题"></a></h2>
155 156 157 158
<div class="section" id="distributed-transpiler">
<span id="distributed-transpiler"></span><h3>Distributed Transpiler<a class="headerlink" href="#distributed-transpiler" title="永久链接至标题"></a></h3>
<p>The <em>Distributed Transpiler</em> converts the user-defined fluid program
into sub-programs to be scheduled on different nodes with the following
159 160 161
steps:</p>
<ol class="simple">
<li>OP placement: the OPs will be placed on different nodes according
162
to a heuristic that minimizes the estimated total computation
163
time. Currently we will use a simple heuristic that puts parameter
164
variables on parameter server workers and everything else on trainer
165 166 167 168 169 170 171 172 173 174
workers.</li>
<li>Add communication OPs to enable the communication between nodes.</li>
</ol>
<p>We will need these OPs: <em>Send</em>, <em>Recv</em>, <em>Enqueue</em>, <em>Dequeue</em>.</p>
<p>Below is an example of converting the user-defined graph into the
subgraphs for the trainer and the parameter server:</p>
<p><img src="src/local-graph.png" alt="Computation graph of the user-defined program before conversion" width="300"/></p>
<p>After converting:</p>
<p><img src="src/dist-graph.png" alt="Trainer and parameter server subgraphs after conversion" width="700"/></p>
<ol class="simple">
175
<li>The parameter variable W and its optimizer program are placed on the parameter server.</li>
176
<li>Operators are added to the program.<ul>
177 178 179 180
<li><em>Send</em> sends data to the connected <em>Recv</em> operator.  The
scheduler on the receive node will only schedule the <em>Recv</em> operator
to run when the <em>Send</em> operator has run (the <em>Send</em> OP will mark
the <em>Recv</em> OP runnable automatically).</li>
181
<li><em>Enqueue</em> enqueues the input variable; it can block until space
182 183
becomes available in the queue.</li>
<li><em>Dequeue</em> outputs a configurable number of tensors from the
184
queue. It will block until the queue has the required number of
185 186 187 188 189 190 191 192
tensors.</li>
</ul>
</li>
</ol>
</div>
<div class="section" id="benefits">
<span id="benefits"></span><h3>Benefits<a class="headerlink" href="#benefits" title="永久链接至标题"></a></h3>
<ul class="simple">
193
<li>Model parallelism becomes easier to implement: it is an extension to
194 195
the trainer - parameter server approach. We can have several &#8220;Transpilers&#8221;
to achieve different goals.</li>
196
<li>A user-defined optimizer is easier to add: the user can now express it as
197
a sub-program.</li>
198 199 200 201 202 203 204
<li>No more duplicated logic inside the trainer and the parameter
server, as mentioned in the Background section.</li>
</ul>
</div>
<div class="section" id="challenges">
<span id="challenges"></span><h3>Challenges<a class="headerlink" href="#challenges" title="永久链接至标题"></a></h3>
<ul class="simple">
205 206
<li>It is important to balance the parameter shards on multiple
parameter servers. If a single parameter is very big (for example: some
207 208 209 210
word-embedding, fully connected, or softmax layers), we need to
automatically partition the single parameter onto different
parameter servers when possible (that is, when only an element-wise optimizer
depends on the parameter variable).</li>
211 212
<li>In the &#8220;Async SGD&#8221; figure, the &#8220;W&#8221; variable on the parameter server
could be read and written concurrently. See
213
<a class="reference external" href="https://github.com/PaddlePaddle/Paddle/pull/6394">here</a> for more
214
details about concurrent programming in Fluid.</li>
215 216 217 218 219 220
</ul>
</div>
<div class="section" id="discussion">
<span id="discussion"></span><h3>Discussion<a class="headerlink" href="#discussion" title="永久链接至标题"></a></h3>
<ul class="simple">
<li>Can the Enqueue OP be implemented under our current tensor design
221 222
(put the input tensor into the queue tensor)?</li>
<li>The <em>Dequeue</em> OP will have a variable number of outputs (depending on the
223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268
<code class="docutils literal"><span class="pre">min_count</span></code> attribute). Does our current design support it? (A similar
question applies to the <em>Add</em> OP.)</li>
</ul>
</div>
<div class="section" id="references">
<span id="references"></span><h3>References:<a class="headerlink" href="#references" title="永久链接至标题"></a></h3>
<p>[1] <a class="reference external" href="https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf">TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems</a></p>
</div>
</div>
</div>


           </div>
          </div>
          <footer>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'../../',
            VERSION:'',
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true
        };
    </script>
      <script type="text/javascript" src="../../_static/jquery.js"></script>
      <script type="text/javascript" src="../../_static/underscore.js"></script>
      <script type="text/javascript" src="../../_static/doctools.js"></script>
      <script type="text/javascript" src="../../_static/translations.js"></script>
      <script type="text/javascript" src="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"></script>
277

278 279 280 281 282 283
  

  
  
    <script type="text/javascript" src="../../_static/js/theme.js"></script>
  
284

285
  
286 287 288 289 290 291 292
  
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.StickyNav.enable();
      });
  </script>
   
293 294 295

</body>
</html>