cpu_profiling_en.html 29.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10


<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
11
  <title>Profiling the Python Code &mdash; PaddlePaddle  documentation</title>
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  

  
  
        <link rel="index" title="Index"
              href="../../genindex.html"/>
        <link rel="search" title="Search" href="../../search.html"/>
    <link rel="top" title="PaddlePaddle  documentation" href="../../index.html"/> 

  <link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/override.css" type="text/css" />
  <script>
  // Analytics loader for hm.baidu.com: builds a <script> element for hm.js and
  // inserts it in front of the first <script> already present in the document.
  // `_hmt` is kept if it already exists; presumably it is the queue hm.js reads
  // once loaded (not visible from this file — confirm against the tracker docs).
  var _hmt = _hmt || [];
  (function() {
    var hm = document.createElement("script");
    // Explicit https: a protocol-relative URL ("//host/...") inherits the page
    // scheme and becomes file://hm.baidu.com/... when the docs are opened from disk.
    hm.src = "https://hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
    var s = document.getElementsByTagName("script")[0];
    s.parentNode.insertBefore(hm, s);
  })();
  </script>

  

  
  <script src="../../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

  
  <header class="site-header">
    <!-- Site banner: logo, GitHub link, language switcher, top-level doc links and search box. -->
    <div class="site-logo">
      <!-- The image is the only content of the link, so its alt text names the link. -->
      <a href="/"><img src="../../_static/images/PP_w.png" alt="PaddlePaddle home"></a>
    </div>
    <div class="site-nav-links">
      <div class="site-menu">
        <!-- rel="noopener noreferrer": the new tab must not get a handle on this window. -->
        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank" rel="noopener noreferrer"><i class="fa fa-github" aria-hidden="true"></i>Fork me on Github</a>
        <div class="language-switcher dropdown">
          <a type="button" data-toggle="dropdown">
            <span>English</span>
            <!-- Icon-font glyphs are decorative; hide them from assistive technology. -->
            <i class="fa fa-angle-up" aria-hidden="true"></i>
            <i class="fa fa-angle-down" aria-hidden="true"></i>
          </a>
          <ul class="dropdown-menu">
            <li><a href="/doc_cn" lang="zh-Hans">中文</a></li>
            <li><a href="/doc">English</a></li>
          </ul>
        </div>
        <ul class="site-page-links">
          <li><a href="/">Home</a></li>
        </ul>
      </div>
      <div class="doc-module">
        <!-- Top-level documentation sections. -->
        <ul>
          <li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_en.html">GET STARTED</a></li>
          <li class="toctree-l1"><a class="reference internal" href="../../build_and_install/index_en.html">Install and Build</a></li>
          <li class="toctree-l1"><a class="reference internal" href="../index_en.html">HOW TO</a></li>
          <li class="toctree-l1"><a class="reference internal" href="../../dev/index_en.html">Development</a></li>
        </ul>

        <div role="search">
          <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
            <!-- A placeholder is not an accessible label; aria-label supplies the name. -->
            <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
            <input type="hidden" name="check_keywords" value="yes" />
            <input type="hidden" name="area" value="default" />
          </form>
        </div>
      </div>
    </div>
  </header>
  
  <div class="main-content-wrap">

    
    <nav class="doc-menu-vertical" role="navigation">
        
          
          <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_en.html">GET STARTED</a><ul>
110
<li class="toctree-l2"><a class="reference internal" href="../../getstarted/quickstart_en.html">Quick Start</a></li>
111 112
</ul>
</li>
113 114 115 116
<li class="toctree-l1"><a class="reference internal" href="../../build_and_install/index_en.html">Install and Build</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../build_and_install/pip_install_en.html">Install Using pip</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../build_and_install/docker_install_en.html">Run in Docker Containers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../build_and_install/build_from_source_en.html">Build from Sources</a></li>
117 118 119
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../index_en.html">HOW TO</a><ul>
120 121 122 123
<li class="toctree-l2"><a class="reference internal" href="../cmd_parameter/index_en.html">Set Command-line Parameters</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../cmd_parameter/use_case_en.html">Use Case</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cmd_parameter/arguments_en.html">Argument Outline</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cmd_parameter/detail_introduction_en.html">Detail Description</a></li>
124 125
</ul>
</li>
126 127 128 129 130 131 132 133
<li class="toctree-l2"><a class="reference internal" href="../cluster/index_en.html">Distributed Training</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../cluster/preparations_en.html">Preparations</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cluster/cmd_argument_en.html">Command-line arguments</a></li>
<li class="toctree-l3"><a class="reference internal" href="../cluster/multi_cluster/index_en.html">Use different clusters</a><ul>
<li class="toctree-l4"><a class="reference internal" href="../cluster/multi_cluster/fabric_en.html">Cluster Training Using Fabric</a></li>
<li class="toctree-l4"><a class="reference internal" href="../cluster/multi_cluster/openmpi_en.html">Cluster Training Using OpenMPI</a></li>
<li class="toctree-l4"><a class="reference internal" href="../cluster/multi_cluster/k8s_en.html">PaddlePaddle On Kubernetes</a></li>
<li class="toctree-l4"><a class="reference internal" href="../cluster/multi_cluster/k8s_aws_en.html">Distributed PaddlePaddle Training on AWS with Kubernetes</a></li>
134 135
</ul>
</li>
136 137 138 139
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../rnn/index_en.html">RNN Models</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../rnn/rnn_config_en.html">RNN Configuration</a></li>
140 141 142 143 144
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="gpu_profiling_en.html">Tune GPU Performance</a></li>
</ul>
</li>
145 146 147 148 149 150
<li class="toctree-l1"><a class="reference internal" href="../../dev/index_en.html">Development</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../dev/new_layer_en.html">Write New Layers</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../dev/contribute_to_paddle_en.html">Contribute Code</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../dev/write_docs_en.html">Contribute Documentation</a></li>
</ul>
</li>
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
</ul>

        
    </nav>
    
    <section class="doc-content-wrap">

      

 







<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
      
171
    <li>Profiling the Python Code</li>
172 173 174 175 176 177 178 179
  </ul>
</div>
      
      <div class="wy-nav-content" id="doc-content">
        <div class="rst-content">
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
180
  <p>This tutorial introduces techniques we use to profile and tune the
181
CPU performance of PaddlePaddle.  We will use Python packages
182 183
<code class="docutils literal"><span class="pre">cProfile</span></code> and <code class="docutils literal"><span class="pre">yep</span></code>, and Google&#8217;s <code class="docutils literal"><span class="pre">perftools</span></code>.</p>
<p>Profiling is the process that reveals performance bottlenecks,
184
which could be very different from what&#8217;s in the developers&#8217; mind.
185
Performance tuning is done to fix these bottlenecks. Performance optimization
186
repeats the steps of profiling and tuning alternately.</p>
187
<p>PaddlePaddle users program AI applications by calling the Python API, which calls
188 189 190 191 192 193 194 195 196 197 198 199 200
into <code class="docutils literal"><span class="pre">libpaddle.so</span></code>, written in C++.  In this tutorial, we focus on
the profiling and tuning of</p>
<ol class="simple">
<li>the Python code and</li>
<li>the mixture of Python and C++ code.</li>
</ol>
<div class="section" id="profiling-the-python-code">
<span id="profiling-the-python-code"></span><h1>Profiling the Python Code<a class="headerlink" href="#profiling-the-python-code" title="Permalink to this headline"></a></h1>
<div class="section" id="generate-the-performance-profiling-file">
<span id="generate-the-performance-profiling-file"></span><h2>Generate the Performance Profiling File<a class="headerlink" href="#generate-the-performance-profiling-file" title="Permalink to this headline"></a></h2>
<p>We can use the Python standard
package, <a class="reference external" href="https://docs.python.org/2/library/profile.html"><code class="docutils literal"><span class="pre">cProfile</span></code></a>,
to generate a Python profiling file.  For example:</p>
201 202 203
<div class="highlight-bash"><div class="highlight"><pre><span></span>python -m cProfile -o profile.out main.py
</pre></div>
</div>
204 205 206
<p>where <code class="docutils literal"><span class="pre">main.py</span></code> is the program we are going to profile, <code class="docutils literal"><span class="pre">-o</span></code> specifies
the output file.  Without <code class="docutils literal"><span class="pre">-o</span></code>, <code class="docutils literal"><span class="pre">cProfile</span></code> would output to standard
output.</p>
207
</div>
208 209 210 211 212
<div class="section" id="look-into-the-profiling-file">
<span id="look-into-the-profiling-file"></span><h2>Look into the Profiling File<a class="headerlink" href="#look-into-the-profiling-file" title="Permalink to this headline"></a></h2>
<p><code class="docutils literal"><span class="pre">cProfile</span></code> generates <code class="docutils literal"><span class="pre">profile.out</span></code> after <code class="docutils literal"><span class="pre">main.py</span></code> completes. We can
use <a class="reference external" href="https://github.com/ymichael/cprofilev"><code class="docutils literal"><span class="pre">cprofilev</span></code></a> to look into
the details:</p>
213 214 215
<div class="highlight-bash"><div class="highlight"><pre><span></span>cprofilev -a <span class="m">0</span>.0.0.0 -p <span class="m">3214</span> -f profile.out main.py
</pre></div>
</div>
216 217 218 219 220 221
<p>where <code class="docutils literal"><span class="pre">-a</span></code> specifies the HTTP IP, <code class="docutils literal"><span class="pre">-p</span></code> specifies the port, <code class="docutils literal"><span class="pre">-f</span></code>
specifies the profiling file, and <code class="docutils literal"><span class="pre">main.py</span></code> is the source file.</p>
<p>Open a Web browser and point it to the local IP and the specified
port; we will see output like the following:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>   <span class="n">ncalls</span>  <span class="n">tottime</span>  <span class="n">percall</span>  <span class="n">cumtime</span>  <span class="n">percall</span> <span class="n">filename</span><span class="p">:</span><span class="n">lineno</span><span class="p">(</span><span class="n">function</span><span class="p">)</span>
        <span class="mi">1</span>    <span class="mf">0.284</span>    <span class="mf">0.284</span>   <span class="mf">29.514</span>   <span class="mf">29.514</span> <span class="n">main</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">1</span><span class="p">(</span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span><span class="p">)</span>
222
     <span class="mi">4696</span>    <span class="mf">0.128</span>    <span class="mf">0.000</span>   <span class="mf">15.748</span>    <span class="mf">0.003</span> <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">executor</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">20</span><span class="p">(</span><span class="n">run</span><span class="p">)</span>
223 224
     <span class="mi">4696</span>   <span class="mf">12.040</span>    <span class="mf">0.003</span>   <span class="mf">12.040</span>    <span class="mf">0.003</span> <span class="p">{</span><span class="n">built</span><span class="o">-</span><span class="ow">in</span> <span class="n">method</span> <span class="n">run</span><span class="p">}</span>
        <span class="mi">1</span>    <span class="mf">0.144</span>    <span class="mf">0.144</span>    <span class="mf">6.534</span>    <span class="mf">6.534</span> <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="fm">__init__</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">14</span><span class="p">(</span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span><span class="p">)</span>
225 226
</pre></div>
</div>
227 228 229
<p>where each line corresponds to a Python function, and the meaning of
each column is as follows:</p>
<table class="docutils">
<thead>
<tr><th scope="col">column</th><th scope="col">meaning</th></tr>
</thead>
<tbody>
<tr><td>ncalls</td><td>the number of calls into a function</td></tr>
<tr><td>tottime</td><td>the total execution time of the function, not including the execution time of other functions called by the function</td></tr>
<tr><td>percall</td><td>tottime divided by ncalls</td></tr>
<tr><td>cumtime</td><td>the total execution time of the function, including the execution time of other functions being called</td></tr>
<tr><td>percall</td><td>cumtime divided by ncalls</td></tr>
<tr><td>filename:lineno(function)</td><td>where the function is defined</td></tr>
</tbody>
</table>
237
</div>
238 239 240 241
<div class="section" id="identify-performance-bottlenecks">
<span id="identify-performance-bottlenecks"></span><h2>Identify Performance Bottlenecks<a class="headerlink" href="#identify-performance-bottlenecks" title="Permalink to this headline"></a></h2>
<p>Usually, <code class="docutils literal"><span class="pre">tottime</span></code> and the related <code class="docutils literal"><span class="pre">percall</span></code> time are what we want to
focus on. We can sort the above profiling file by tottime:</p>
242 243
<div class="highlight-text"><div class="highlight"><pre><span></span>     4696   12.040    0.003   12.040    0.003 {built-in method run}
   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
244 245 246
   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:219(__init__)
     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/framework.py:428(sync_with_cpp)
        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/fluid/__init__.py:1(&lt;module&gt;)
247 248
</pre></div>
</div>
249
<p>We can see that the most time-consuming function is the <code class="docutils literal"><span class="pre">built-in</span> <span class="pre">method</span> <span class="pre">run</span></code>, which is a C++ function in <code class="docutils literal"><span class="pre">libpaddle.so</span></code>.  We will
250
explain how to profile C++ code in the next section.  At this
251 252 253
moment, let&#8217;s look into the fourth function <code class="docutils literal"><span class="pre">sync_with_cpp</span></code>, which is a
Python function.  We can click it to understand more about it:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Called</span> <span class="n">By</span><span class="p">:</span>
254

255 256
   <span class="n">Ordered</span> <span class="n">by</span><span class="p">:</span> <span class="n">internal</span> <span class="n">time</span>
   <span class="n">List</span> <span class="n">reduced</span> <span class="kn">from</span> <span class="mi">4497</span> <span class="n">to</span> <span class="mi">2</span> <span class="n">due</span> <span class="n">to</span> <span class="n">restriction</span> <span class="o">&lt;</span><span class="s1">&#39;sync_with_cpp&#39;</span><span class="o">&gt;</span>
257

258 259
<span class="n">Function</span>                                                                                                 <span class="n">was</span> <span class="n">called</span> <span class="n">by</span><span class="o">...</span>
                                                                                                             <span class="n">ncalls</span>  <span class="n">tottime</span>  <span class="n">cumtime</span>
260 261 262
<span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">428</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>  <span class="o">&lt;-</span>    <span class="mi">4697</span>    <span class="mf">0.626</span>    <span class="mf">2.291</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">562</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>
<span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">562</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>  <span class="o">&lt;-</span>    <span class="mi">4696</span>    <span class="mf">0.019</span>    <span class="mf">2.316</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">487</span><span class="p">(</span><span class="n">clone</span><span class="p">)</span>
                                                                                                                  <span class="mi">1</span>    <span class="mf">0.000</span>    <span class="mf">0.001</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">534</span><span class="p">(</span><span class="n">append_backward</span><span class="p">)</span>
263 264


265
<span class="n">Called</span><span class="p">:</span>
266

267 268
   <span class="n">Ordered</span> <span class="n">by</span><span class="p">:</span> <span class="n">internal</span> <span class="n">time</span>
   <span class="n">List</span> <span class="n">reduced</span> <span class="kn">from</span> <span class="mi">4497</span> <span class="n">to</span> <span class="mi">2</span> <span class="n">due</span> <span class="n">to</span> <span class="n">restriction</span> <span class="o">&lt;</span><span class="s1">&#39;sync_with_cpp&#39;</span><span class="o">&gt;</span>
269 270
</pre></div>
</div>
271 272
<p>The lists of the callers of <code class="docutils literal"><span class="pre">sync_with_cpp</span></code> might help us understand
how to improve the function definition.</p>
273 274
</div>
</div>
275 276 277 278 279 280 281 282 283 284 285
<div class="section" id="profiling-python-and-c-code">
<span id="profiling-python-and-c-code"></span><h1>Profiling Python and C++ Code<a class="headerlink" href="#profiling-python-and-c-code" title="Permalink to this headline"></a></h1>
<div class="section" id="generate-the-profiling-file">
<span id="generate-the-profiling-file"></span><h2>Generate the Profiling File<a class="headerlink" href="#generate-the-profiling-file" title="Permalink to this headline"></a></h2>
<p>To profile a mixture of Python and C++ code, we can use a Python
package, <code class="docutils literal"><span class="pre">yep</span></code>, that can work with Google&#8217;s <code class="docutils literal"><span class="pre">perftools</span></code>, which is a
commonly-used profiler for C/C++ code.</p>
<p>In Ubuntu systems, we can install <code class="docutils literal"><span class="pre">yep</span></code> and <code class="docutils literal"><span class="pre">perftools</span></code> by running the
following commands:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>apt update
apt install libgoogle-perftools-dev
286 287 288
pip install yep
</pre></div>
</div>
289
<p>Then we can run the following command</p>
290 291 292
<div class="highlight-bash"><div class="highlight"><pre><span></span>python -m yep -v main.py
</pre></div>
</div>
293 294 295
<p>to generate the profiling file.  The default filename is
<code class="docutils literal"><span class="pre">main.py.prof</span></code>.</p>
<p>Please be aware of the <code class="docutils literal"><span class="pre">-v</span></code> command line option, which prints the
296 297
analysis results after generating the profiling file.  By examining the
printed result, we&#8217;d know whether we stripped debug
298 299
information from <code class="docutils literal"><span class="pre">libpaddle.so</span></code> at build time.  The following hints
help make sure that the analysis results are readable:</p>
300
<ol class="simple">
301 302 303 304 305 306 307 308 309 310 311 312
<li>Use GCC command line option <code class="docutils literal"><span class="pre">-g</span></code> when building <code class="docutils literal"><span class="pre">libpaddle.so</span></code> so as to
include the debug information.  The standard building system of
PaddlePaddle is CMake, so you might want to set
<code class="docutils literal"><span class="pre">CMAKE_BUILD_TYPE=RelWithDebInfo</span></code>.</li>
<li>Use GCC command line option <code class="docutils literal"><span class="pre">-O2</span></code> or <code class="docutils literal"><span class="pre">-O3</span></code> to generate optimized
binary code. It doesn&#8217;t make sense to profile <code class="docutils literal"><span class="pre">libpaddle.so</span></code>
without optimization, because it would anyway run slowly.</li>
<li>Profile the single-threaded binary file before the
multi-threaded version, because the latter often generates tangled
profiling analysis results.  You might want to set the environment
variable <code class="docutils literal"><span class="pre">OMP_NUM_THREADS=1</span></code> to prevent OpenMP from automatically
starting multiple threads.</li>
313 314
</ol>
</div>
315 316 317
<div class="section" id="examining-the-profiling-file">
<span id="examining-the-profiling-file"></span><h2>Examining the Profiling File<a class="headerlink" href="#examining-the-profiling-file" title="Permalink to this headline"></a></h2>
<p>The tool we use to examine the profiling file generated by
318 319 320 321
<code class="docutils literal"><span class="pre">perftools</span></code> is <a class="reference external" href="https://github.com/google/pprof"><code class="docutils literal"><span class="pre">pprof</span></code></a>, which
provides a Web-based GUI like <code class="docutils literal"><span class="pre">cprofilev</span></code>.</p>
<p>We can rely on the standard Go toolchain to retrieve the source code
of <code class="docutils literal"><span class="pre">pprof</span></code> and build it:</p>
322 323 324
<div class="highlight-bash"><div class="highlight"><pre><span></span>go get github.com/google/pprof
</pre></div>
</div>
325 326
<p>Then we can use it to examine <code class="docutils literal"><span class="pre">main.py.prof</span></code> generated in the previous
section:</p>
327 328 329
<div class="highlight-bash"><div class="highlight"><pre><span></span>pprof -http<span class="o">=</span><span class="m">0</span>.0.0.0:3213 <span class="sb">`</span>which python<span class="sb">`</span>  ./main.py.prof
</pre></div>
</div>
330 331 332
<p>where <code class="docutils literal"><span class="pre">-http</span></code> specifies the IP and port of the HTTP service.
Directing our Web browser to the service, we would see something like
the following:</p>
333 334
<p><img alt="result" src="../../_images/pprof_1.png" /></p>
</div>
335 336 337 338
<div class="section" id="identifying-the-performance-bottlenecks">
<span id="identifying-the-performance-bottlenecks"></span><h2>Identifying the Performance Bottlenecks<a class="headerlink" href="#identifying-the-performance-bottlenecks" title="Permalink to this headline"></a></h2>
<p>Similar to how we work with <code class="docutils literal"><span class="pre">cprofilev</span></code>, we&#8217;d focus on <code class="docutils literal"><span class="pre">tottime</span></code> and
<code class="docutils literal"><span class="pre">cumtime</span></code>.</p>
339
<p><img alt="kernel_perf" src="../../_images/pprof_2.png" /></p>
340 341 342 343 344
<p>We can see that multiplication and the computation
of its gradient take 2% to 4% of the total running
time, and <code class="docutils literal"><span class="pre">MomentumOp</span></code> takes about 17%. Obviously, we&#8217;d want to
optimize <code class="docutils literal"><span class="pre">MomentumOp</span></code>.</p>
<p><code class="docutils literal"><span class="pre">pprof</span></code> would mark performance critical parts of the program in
345
red. It&#8217;s a good idea to follow the hints.</p>
346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
</div>
</div>


           </div>
          </div>
          <footer>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        // Page-level settings object defined before the Sphinx scripts are loaded;
        // presumably read by doctools.js and friends — confirm against those files.
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT: '../../',
            VERSION: '',
            COLLAPSE_INDEX: false,
            FILE_SUFFIX: '.html',
            HAS_SOURCE: true,
            // No trailing comma after the last property: ES3 grammar does not allow
            // one (legal only since ES5), and this page still ships legacy-IE
            // conditional markup.
            SOURCELINK_SUFFIX: ".txt"
        };
    </script>
      <script type="text/javascript" src="../../_static/jquery.js"></script>
      <script type="text/javascript" src="../../_static/underscore.js"></script>
      <script type="text/javascript" src="../../_static/doctools.js"></script>
      <script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
       
  

  
  
    <script type="text/javascript" src="../../_static/js/theme.js"></script>
  
  
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
  <script src="../../_static/js/paddle_doc_init.js"></script> 

</body>
</html>