<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>OpenMPI &mdash; PaddlePaddle  documentation</title>
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../../../_static/css/theme.css" type="text/css" />
  

  

        <link rel="index" title="Index"
              href="../../../genindex.html"/>
        <link rel="search" title="Search" href="../../../search.html"/>
    <link rel="top" title="PaddlePaddle  documentation" href="../../../index.html"/>
        <link rel="up" title="Use different clusters" href="index_en.html"/>
        <link rel="next" title="Kubernetes" href="k8s_en.html"/>
        <link rel="prev" title="Fabric" href="fabric_en.html"/>

  
  <script src="../../../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          

          
            <a href="../../../index_en.html" class="icon icon-home"> PaddlePaddle
          

          
          </a>

          
            
            
          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
                <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../../getstarted/index_en.html">GET STARTED</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../build_and_install/index_en.html">Install and Build</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../../index_en.html">HOW TO</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../../cmd_parameter/index_en.html">Set Command-line Parameters</a></li>
<li class="toctree-l2 current"><a class="reference internal" href="../index_en.html">Distributed Training</a><ul class="current">
<li class="toctree-l3"><a class="reference internal" href="../preparations_en.html">Preparations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cmd_argument_en.html">Command-line arguments</a></li>
<li class="toctree-l3 current"><a class="reference internal" href="index_en.html">Use different clusters</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="fabric_en.html">Fabric</a></li>
<li class="toctree-l4 current"><a class="current reference internal" href="#">OpenMPI</a></li>
<li class="toctree-l4"><a class="reference internal" href="k8s_en.html">Kubernetes</a></li>
<li class="toctree-l4"><a class="reference internal" href="k8s_aws_en.html">Kubernetes on AWS</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../rnn/index_en.html">RNN Models</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../optimization/gpu_profiling_en.html">Tune GPU Performance</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../../dev/index_en.html">Development</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../faq/index_en.html">FAQ</a></li>
</ul>

            
          
        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
        <a href="../../../index_en.html">PaddlePaddle</a>
      </nav>

      <div class="wy-nav-content">
        <div class="rst-content">




<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
    <li><a href="../../../index_en.html">Docs</a> &raquo;</li>
          <li><a href="../../index_en.html">HOW TO</a> &raquo;</li>
          <li><a href="../index_en.html">Distributed Training</a> &raquo;</li>
          <li><a href="index_en.html">Use different clusters</a> &raquo;</li>
    <li>OpenMPI</li>
      <li class="wy-breadcrumbs-aside">
            <a href="../../../_sources/howto/cluster/multi_cluster/openmpi_en.md.txt" rel="nofollow"> View page source</a>
      </li>
  </ul>
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="openmpi">
<span id="openmpi"></span><h1>OpenMPI<a class="headerlink" href="#openmpi" title="Permalink to this headline"></a></h1>
<div class="section" id="prepare-an-openmpi-cluster">
<span id="prepare-an-openmpi-cluster"></span><h2>Prepare an OpenMPI cluster<a class="headerlink" href="#prepare-an-openmpi-cluster" title="Permalink to this headline"></a></h2>
<p>Run the following commands to start a 3-node MPI cluster and one &#8220;head&#8221; node.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="nb">cd</span> paddle/scripts/cluster_train_v2/openmpi/docker_cluster
kubectl create -f head.yaml
kubectl create -f mpi-nodes.yaml
</pre></div>
</div>
<p>Then you can log in to every OpenMPI node using ssh without entering a password.</p>
</div>
<div class="section" id="launching-cluster-job">
<span id="launching-cluster-job"></span><h2>Launching Cluster Job<a class="headerlink" href="#launching-cluster-job" title="Permalink to this headline"></a></h2>
<p>Follow these steps to launch a PaddlePaddle training job in an OpenMPI cluster:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="c1"># find out node IP addresses</span>
kubectl get po -o wide
<span class="c1"># generate a &quot;machines&quot; file containing node IP addresses</span>
kubectl get po -o wide <span class="p">|</span> grep nodes <span class="p">|</span> awk <span class="s1">&#39;{print $6}&#39;</span> &gt; machines
<span class="c1"># copy necessary files onto &quot;head&quot; node</span>
scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@<span class="o">[</span>headIP<span class="o">]</span>:~
<span class="c1"># login to head node using ssh</span>
ssh -i ssh/id_rsa.mpi.pub tutorial@<span class="o">[</span>headIP<span class="o">]</span>
<span class="c1"># --------------- in head node ---------------</span>
<span class="c1"># prepare training data</span>
python prepare.py
<span class="c1"># copy the dict file, training scripts and machines file to MPI nodes</span>
cat machines <span class="p">|</span> xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines <span class="o">{}</span>:/home/tutorial
<span class="c1"># create a directory for storing log files</span>
mpirun -hostfile machines -n <span class="m">3</span> mkdir /home/tutorial/logs
<span class="c1"># copy training data to every node</span>
scp train.txt-00000 test.txt-00000 <span class="o">[</span>node1IP<span class="o">]</span>:/home/tutorial
scp train.txt-00001 test.txt-00001 <span class="o">[</span>node2IP<span class="o">]</span>:/home/tutorial
scp train.txt-00002 test.txt-00002 <span class="o">[</span>node3IP<span class="o">]</span>:/home/tutorial
<span class="c1"># start the job</span>
mpirun -hostfile machines -n <span class="m">3</span>  /home/tutorial/start_mpi_train.sh
</pre></div>
</div>
</div>
</div>


           </div>
          </div>
          <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
        <a href="k8s_en.html" class="btn btn-neutral float-right" title="Kubernetes" accesskey="n">Next <span class="fa fa-arrow-circle-right"></span></a>
        <a href="fabric_en.html" class="btn btn-neutral" title="Fabric" accesskey="p"><span class="fa fa-arrow-circle-left"></span> Previous</a>
    </div>


  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        // Page-level settings object, declared as a global. Presumably read by
        // the Sphinx scripts included after this block (confirm against
        // doctools.js). Stray numeric tokens that had leaked into the literal
        // and made it a syntax error have been removed.
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'../../../',
            VERSION:'',
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true
        };
    </script>
      <script type="text/javascript" src="../../../_static/jquery.js"></script>
      <script type="text/javascript" src="../../../_static/underscore.js"></script>
      <script type="text/javascript" src="../../../_static/doctools.js"></script>
      <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>

    <script type="text/javascript" src="../../../_static/js/theme.js"></script>

  
  <script type="text/javascript">
      // Registered as jQuery's ready callback: turns on the theme's StickyNav.
      jQuery(function enableStickyNav() {
          SphinxRtdTheme.StickyNav.enable();
      });
  </script>
   

</body>
</html>