<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>Profiling the Python Code &mdash; PaddlePaddle  文档</title>
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  

  
  
        <link rel="index" title="索引"
              href="../../genindex.html"/>
        <link rel="search" title="搜索" href="../../search.html"/>
    <link rel="top" title="PaddlePaddle  文档" href="../../index.html"/> 

  <link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/override.css" type="text/css" />
  <script>
  // Analytics loader for hm.baidu.com.
  // Reuse an existing global `_hmt` array if one was already defined,
  // otherwise start with an empty one (presumably the command queue that
  // hm.js consumes -- confirm against the vendor documentation).
  var _hmt = _hmt || [];
  (function() {
    // Create the <script> element that will fetch the tracker.
    var hm = document.createElement("script");
    // Use an explicit https: scheme rather than a protocol-relative URL so the
    // tracker is never fetched over plain HTTP and still resolves when the
    // page is opened from file://.
    hm.src = "https://hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
    // Insert the tracker immediately before the first <script> in the document.
    var s = document.getElementsByTagName("script")[0];
    s.parentNode.insertBefore(hm, s);
  })();
  </script>

  

  
  <script src="../../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

  
  <header class="site-header">
    <div class="site-logo">
      <a href="/"><img src="../../_static/images/PP_w.png" alt="PaddlePaddle home"></a>
    </div>
    <div class="site-nav-links">
      <div class="site-menu">
        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
        <div class="language-switcher dropdown">
          <a type="button" data-toggle="dropdown">
            <span>English</span>
            <i class="fa fa-angle-up"></i>
            <i class="fa fa-angle-down"></i>
          </a>
          <ul class="dropdown-menu">
            <li><a href="/doc_cn">中文</a></li>
            <li><a href="/doc">English</a></li>
          </ul>
        </div>
        <ul class="site-page-links">
          <li><a href="/">Home</a></li>
        </ul>
      </div>
      <div class="doc-module">
        
        <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_cn.html">新手入门</a></li>
<li class="toctree-l1"><a class="reference internal" href="../index_cn.html">进阶指南</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/index_cn.html">API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../faq/index_cn.html">FAQ</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../mobile/index_cn.html">MOBILE</a></li>
</ul>

        
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>        
      </div>
    </div>
  </header>
  
  <div class="main-content-wrap">

    
    <nav class="doc-menu-vertical" role="navigation">
        
          
          <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_cn.html">新手入门</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../getstarted/build_and_install/index_cn.html">安装与编译</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/pip_install_cn.html">使用pip安装</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/docker_install_cn.html">使用Docker安装运行</a></li>
<li class="toctree-l3"><a class="reference internal" href="../dev/build_cn.html">用Docker编译和测试PaddlePaddle</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/build_from_source_cn.html">从源码编译</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../getstarted/concepts/use_concepts_cn.html">基本使用概念</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../index_cn.html">进阶指南</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../usage/cmd_parameter/index_cn.html">设置命令行参数</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../usage/cmd_parameter/use_case_cn.html">使用案例</a></li>
<li class="toctree-l3"><a class="reference internal" href="../usage/cmd_parameter/arguments_cn.html">参数概述</a></li>
<li class="toctree-l3"><a class="reference internal" href="../usage/cmd_parameter/detail_introduction_cn.html">细节描述</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../usage/cluster/cluster_train_cn.html">PaddlePaddle分布式训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../usage/k8s/k8s_basis_cn.html">Kubernetes 简介</a></li>
<li class="toctree-l2"><a class="reference internal" href="../usage/k8s/k8s_cn.html">Kubernetes单机训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../usage/k8s/k8s_distributed_cn.html">Kubernetes分布式训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dev/contribute_to_paddle_cn.html">如何贡献代码</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dev/write_docs_cn.html">如何贡献/修改文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../deep_model/rnn/index_cn.html">RNN相关模型</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../deep_model/rnn/rnn_config_cn.html">RNN配置</a></li>
<li class="toctree-l3"><a class="reference internal" href="../deep_model/rnn/recurrent_group_cn.html">Recurrent Group教程</a></li>
<li class="toctree-l3"><a class="reference internal" href="../deep_model/rnn/hierarchical_layer_cn.html">支持双层序列作为输入的Layer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../deep_model/rnn/hrnn_rnn_api_compare_cn.html">单双层RNN API对比介绍</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="gpu_profiling_cn.html">GPU性能分析与调优</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../api/index_cn.html">API</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/model_configs.html">模型配置</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/activation.html">Activation</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/layer.html">Layers</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/evaluators.html">Evaluators</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/optimizer.html">Optimizer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/pooling.html">Pooling</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/networks.html">Networks</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/attr.html">Parameter Attribute</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/data.html">数据访问</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/data/data_reader.html">Data Reader Interface</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/data/image.html">Image Interface</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/data/dataset.html">Dataset</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/run_logic.html">训练与应用</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/fluid.html">Fluid</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/layers.html">Layers</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/data_feeder.html">DataFeeder</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/executor.html">Executor</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/initializer.html">Initializer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/evaluator.html">Evaluator</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/nets.html">Nets</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/optimizer.html">Optimizer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/param_attr.html">ParamAttr</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/profiler.html">Profiler</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/fluid/regularizer.html">Regularizer</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../faq/index_cn.html">FAQ</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../faq/build_and_install/index_cn.html">编译安装与单元测试</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../faq/model/index_cn.html">模型配置</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../faq/parameter/index_cn.html">参数设置</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../faq/local/index_cn.html">本地训练与预测</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../faq/cluster/index_cn.html">集群训练与预测</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../mobile/index_cn.html">MOBILE</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../mobile/cross_compiling_for_android_cn.html">Android平台编译指南</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../mobile/cross_compiling_for_ios_cn.html">iOS平台编译指南</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../mobile/cross_compiling_for_raspberry_cn.html">Raspberry Pi平台编译指南</a></li>
</ul>
</li>
</ul>

        
    </nav>
    
    <section class="doc-content-wrap">

      

 







<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
      
    <li>Profiling the Python Code</li>
  </ul>
</div>
      
      <div class="wy-nav-content" id="doc-content">
        <div class="rst-content">
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <p>This tutorial introduces techniques we use to profile and tune the
CPU performance of PaddlePaddle.  We will use Python packages
<code class="docutils literal"><span class="pre">cProfile</span></code> and <code class="docutils literal"><span class="pre">yep</span></code>, and Google&#8217;s <code class="docutils literal"><span class="pre">perftools</span></code>.</p>
<p>Profiling is the process that reveals performance bottlenecks,
which could be very different from what developers have in mind.
Performance tuning is done to fix these bottlenecks. Performance optimization
repeats the steps of profiling and tuning alternately.</p>
<p>PaddlePaddle users program AI applications by calling the Python API, which calls
into <code class="docutils literal"><span class="pre">libpaddle.so</span></code>, which is written in C++.  In this tutorial, we focus on
the profiling and tuning of</p>
<ol class="simple">
<li>the Python code and</li>
<li>the mixture of Python and C++ code.</li>
</ol>
<div class="section" id="profiling-the-python-code">
<span id="profiling-the-python-code"></span><h1>Profiling the Python Code<a class="headerlink" href="#profiling-the-python-code" title="永久链接至标题"></a></h1>
<div class="section" id="generate-the-performance-profiling-file">
<span id="generate-the-performance-profiling-file"></span><h2>Generate the Performance Profiling File<a class="headerlink" href="#generate-the-performance-profiling-file" title="永久链接至标题"></a></h2>
<p>We can use the Python standard
package, <a class="reference external" href="https://docs.python.org/2/library/profile.html"><code class="docutils literal"><span class="pre">cProfile</span></code></a>,
to generate a Python profiling file.  For example:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>python -m cProfile -o profile.out main.py
</pre></div>
</div>
<p>where <code class="docutils literal"><span class="pre">main.py</span></code> is the program we are going to profile, and <code class="docutils literal"><span class="pre">-o</span></code> specifies
the output file.  Without <code class="docutils literal"><span class="pre">-o</span></code>, <code class="docutils literal"><span class="pre">cProfile</span></code> would output to standard
output.</p>
</div>
<div class="section" id="look-into-the-profiling-file">
<span id="look-into-the-profiling-file"></span><h2>Look into the Profiling File<a class="headerlink" href="#look-into-the-profiling-file" title="永久链接至标题"></a></h2>
<p><code class="docutils literal"><span class="pre">cProfile</span></code> generates <code class="docutils literal"><span class="pre">profile.out</span></code> after <code class="docutils literal"><span class="pre">main.py</span></code> completes. We can
use <a class="reference external" href="https://github.com/ymichael/cprofilev"><code class="docutils literal"><span class="pre">cprofilev</span></code></a> to look into
the details:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>cprofilev -a <span class="m">0</span>.0.0.0 -p <span class="m">3214</span> -f profile.out main.py
</pre></div>
</div>
<p>where <code class="docutils literal"><span class="pre">-a</span></code> specifies the HTTP IP, <code class="docutils literal"><span class="pre">-p</span></code> specifies the port, <code class="docutils literal"><span class="pre">-f</span></code>
specifies the profiling file, and <code class="docutils literal"><span class="pre">main.py</span></code> is the source file.</p>
<p>Open a Web browser and point it to the local IP and the specified
port, and we will see output like the following:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>   <span class="n">ncalls</span>  <span class="n">tottime</span>  <span class="n">percall</span>  <span class="n">cumtime</span>  <span class="n">percall</span> <span class="n">filename</span><span class="p">:</span><span class="n">lineno</span><span class="p">(</span><span class="n">function</span><span class="p">)</span>
        <span class="mi">1</span>    <span class="mf">0.284</span>    <span class="mf">0.284</span>   <span class="mf">29.514</span>   <span class="mf">29.514</span> <span class="n">main</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">1</span><span class="p">(</span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span><span class="p">)</span>
     <span class="mi">4696</span>    <span class="mf">0.128</span>    <span class="mf">0.000</span>   <span class="mf">15.748</span>    <span class="mf">0.003</span> <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">executor</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">20</span><span class="p">(</span><span class="n">run</span><span class="p">)</span>
     <span class="mi">4696</span>   <span class="mf">12.040</span>    <span class="mf">0.003</span>   <span class="mf">12.040</span>    <span class="mf">0.003</span> <span class="p">{</span><span class="n">built</span><span class="o">-</span><span class="ow">in</span> <span class="n">method</span> <span class="n">run</span><span class="p">}</span>
        <span class="mi">1</span>    <span class="mf">0.144</span>    <span class="mf">0.144</span>    <span class="mf">6.534</span>    <span class="mf">6.534</span> <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="fm">__init__</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">14</span><span class="p">(</span><span class="o">&lt;</span><span class="n">module</span><span class="o">&gt;</span><span class="p">)</span>
</pre></div>
</div>
<p>where each line corresponds to a Python function, and the meaning of
each column is as follows:</p>
<table class="docutils">
<thead>
<tr><th scope="col">column</th><th scope="col">meaning</th></tr>
</thead>
<tbody>
<tr><td>ncalls</td><td>the number of calls into a function</td></tr>
<tr><td>tottime</td><td>the total execution time of the function, not including the
execution time of other functions called by the function</td></tr>
<tr><td>percall</td><td>tottime divided by ncalls</td></tr>
<tr><td>cumtime</td><td>the total execution time of the function, including the execution time of other functions being called</td></tr>
<tr><td>percall</td><td>cumtime divided by ncalls</td></tr>
<tr><td>filename:lineno(function)</td><td>where the function is defined</td></tr>
</tbody>
</table>
</div>
<div class="section" id="identify-performance-bottlenecks">
<span id="identify-performance-bottlenecks"></span><h2>Identify Performance Bottlenecks<a class="headerlink" href="#identify-performance-bottlenecks" title="永久链接至标题"></a></h2>
<p>Usually, <code class="docutils literal"><span class="pre">tottime</span></code> and the related <code class="docutils literal"><span class="pre">percall</span></code> time are what we want to
focus on. We can sort the above profiling file by <code class="docutils literal"><span class="pre">tottime</span></code>:</p>
<div class="highlight-text"><div class="highlight"><pre><span></span>     4696   12.040    0.003   12.040    0.003 {built-in method run}
   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(&lt;module&gt;)
</pre></div>
</div>
<p>We can see that the most time-consuming function is the <code class="docutils literal"><span class="pre">built-in</span> <span class="pre">method</span> <span class="pre">run</span></code>, which is a C++ function in <code class="docutils literal"><span class="pre">libpaddle.so</span></code>.  We will
explain how to profile C++ code in the next section.  At this
moment, let&#8217;s look into the fourth function <code class="docutils literal"><span class="pre">sync_with_cpp</span></code>, which is a
Python function.  We can click it to understand more about it:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Called</span> <span class="n">By</span><span class="p">:</span>

   <span class="n">Ordered</span> <span class="n">by</span><span class="p">:</span> <span class="n">internal</span> <span class="n">time</span>
   <span class="n">List</span> <span class="n">reduced</span> <span class="kn">from</span> <span class="mi">4497</span> <span class="n">to</span> <span class="mi">2</span> <span class="n">due</span> <span class="n">to</span> <span class="n">restriction</span> <span class="o">&lt;</span><span class="s1">&#39;sync_with_cpp&#39;</span><span class="o">&gt;</span>

<span class="n">Function</span>                                                                                                 <span class="n">was</span> <span class="n">called</span> <span class="n">by</span><span class="o">...</span>
                                                                                                             <span class="n">ncalls</span>  <span class="n">tottime</span>  <span class="n">cumtime</span>
<span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">428</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>  <span class="o">&lt;-</span>    <span class="mi">4697</span>    <span class="mf">0.626</span>    <span class="mf">2.291</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">562</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>
<span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">562</span><span class="p">(</span><span class="n">sync_with_cpp</span><span class="p">)</span>  <span class="o">&lt;-</span>    <span class="mi">4696</span>    <span class="mf">0.019</span>    <span class="mf">2.316</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">487</span><span class="p">(</span><span class="n">clone</span><span class="p">)</span>
                                                                                                                  <span class="mi">1</span>    <span class="mf">0.000</span>    <span class="mf">0.001</span>  <span class="o">/</span><span class="n">home</span><span class="o">/</span><span class="n">yuyang</span><span class="o">/</span><span class="n">perf_test</span><span class="o">/.</span><span class="n">env</span><span class="o">/</span><span class="n">lib</span><span class="o">/</span><span class="n">python2</span><span class="o">.</span><span class="mi">7</span><span class="o">/</span><span class="n">site</span><span class="o">-</span><span class="n">packages</span><span class="o">/</span><span class="n">paddle</span><span class="o">/</span><span class="n">v2</span><span class="o">/</span><span class="n">fluid</span><span class="o">/</span><span class="n">framework</span><span class="o">.</span><span class="n">py</span><span class="p">:</span><span class="mi">534</span><span class="p">(</span><span class="n">append_backward</span><span class="p">)</span>


<span class="n">Called</span><span class="p">:</span>

   <span class="n">Ordered</span> <span class="n">by</span><span class="p">:</span> <span class="n">internal</span> <span class="n">time</span>
   <span class="n">List</span> <span class="n">reduced</span> <span class="kn">from</span> <span class="mi">4497</span> <span class="n">to</span> <span class="mi">2</span> <span class="n">due</span> <span class="n">to</span> <span class="n">restriction</span> <span class="o">&lt;</span><span class="s1">&#39;sync_with_cpp&#39;</span><span class="o">&gt;</span>
</pre></div>
</div>
<p>The list of callers of <code class="docutils literal"><span class="pre">sync_with_cpp</span></code> might help us understand
how to improve the function definition.</p>
</div>
</div>
<div class="section" id="profiling-python-and-c-code">
<span id="profiling-python-and-c-code"></span><h1>Profiling Python and C++ Code<a class="headerlink" href="#profiling-python-and-c-code" title="永久链接至标题"></a></h1>
<div class="section" id="generate-the-profiling-file">
<span id="generate-the-profiling-file"></span><h2>Generate the Profiling File<a class="headerlink" href="#generate-the-profiling-file" title="永久链接至标题"></a></h2>
<p>To profile a mixture of Python and C++ code, we can use a Python
package, <code class="docutils literal"><span class="pre">yep</span></code>, that can work with Google&#8217;s <code class="docutils literal"><span class="pre">perftools</span></code>, which is a
commonly-used profiler for C/C++ code.</p>
<p>In Ubuntu systems, we can install <code class="docutils literal"><span class="pre">yep</span></code> and <code class="docutils literal"><span class="pre">perftools</span></code> by running the
following commands:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>apt update
apt install libgoogle-perftools-dev
pip install yep
</pre></div>
</div>
<p>Then we can run the following command</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>python -m yep -v main.py
</pre></div>
</div>
<p>to generate the profiling file.  The default filename is
<code class="docutils literal"><span class="pre">main.py.prof</span></code>.</p>
<p>Please be aware of the <code class="docutils literal"><span class="pre">-v</span></code> command line option, which prints the
analysis results after generating the profiling file.  By examining
the printed result, we&#8217;d know whether we stripped debug
information from <code class="docutils literal"><span class="pre">libpaddle.so</span></code> at build time.  The following hints
help make sure that the analysis results are readable:</p>
<ol class="simple">
<li>Use GCC command line option <code class="docutils literal"><span class="pre">-g</span></code> when building <code class="docutils literal"><span class="pre">libpaddle.so</span></code> so as to
include the debug information.  The standard building system of
PaddlePaddle is CMake, so you might want to set
<code class="docutils literal"><span class="pre">CMAKE_BUILD_TYPE=RelWithDebInfo</span></code>.</li>
<li>Use GCC command line option <code class="docutils literal"><span class="pre">-O2</span></code> or <code class="docutils literal"><span class="pre">-O3</span></code> to generate optimized
binary code. It doesn&#8217;t make sense to profile <code class="docutils literal"><span class="pre">libpaddle.so</span></code>
without optimization, because it would run slowly anyway.</li>
<li>Profile the single-threaded binary file before the
multi-threaded version, because the latter often generates tangled
profiling analysis results.  You might want to set the environment
variable <code class="docutils literal"><span class="pre">OMP_NUM_THREADS=1</span></code> to prevent OpenMP from automatically
starting multiple threads.</li>
</ol>
</div>
<div class="section" id="examining-the-profiling-file">
<span id="examining-the-profiling-file"></span><h2>Examining the Profiling File<a class="headerlink" href="#examining-the-profiling-file" title="永久链接至标题"></a></h2>
<p>The tool we use to examine the profiling file generated by
<code class="docutils literal"><span class="pre">perftools</span></code> is <a class="reference external" href="https://github.com/google/pprof"><code class="docutils literal"><span class="pre">pprof</span></code></a>, which
provides a Web-based GUI like <code class="docutils literal"><span class="pre">cprofilev</span></code>.</p>
<p>We can rely on the standard Go toolchain to retrieve the source code
of <code class="docutils literal"><span class="pre">pprof</span></code> and build it:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>go get github.com/google/pprof
</pre></div>
</div>
<p>Then we can use it to profile <code class="docutils literal"><span class="pre">main.py.prof</span></code> generated in the previous
section:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>pprof -http<span class="o">=</span><span class="m">0</span>.0.0.0:3213 <span class="sb">`</span>which python<span class="sb">`</span>  ./main.py.prof
</pre></div>
</div>
<p>where <code class="docutils literal"><span class="pre">-http</span></code> specifies the IP and port of the HTTP service.
Directing our Web browser to the service, we would see something like
the following:</p>
<p><img alt="result" src="../../_images/pprof_1.png" /></p>
</div>
<div class="section" id="identifying-the-performance-bottlenecks">
<span id="identifying-the-performance-bottlenecks"></span><h2>Identifying the Performance Bottlenecks<a class="headerlink" href="#identifying-the-performance-bottlenecks" title="永久链接至标题"></a></h2>
<p>Similar to how we work with <code class="docutils literal"><span class="pre">cprofilev</span></code>, we&#8217;d focus on <code class="docutils literal"><span class="pre">tottime</span></code> and
<code class="docutils literal"><span class="pre">cumtime</span></code>.</p>
<p><img alt="kernel_perf" src="../../_images/pprof_2.png" /></p>
<p>We can see that the execution time of multiplication and of computing
the gradient of multiplication each take 2% to 4% of the total running
time, and <code class="docutils literal"><span class="pre">MomentumOp</span></code> takes about 17%. Obviously, we&#8217;d want to
optimize <code class="docutils literal"><span class="pre">MomentumOp</span></code>.</p>
<p><code class="docutils literal"><span class="pre">pprof</span></code> would mark performance-critical parts of the program in
red. It&#8217;s a good idea to follow the hints.</p>
</div>
</div>


           </div>
          </div>
          <footer>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        // Global settings object for this page; presumably read by the Sphinx
        // scripts loaded right after it (doctools.js etc.) -- confirm there.
        // No trailing comma after the last property: ES3-era engines (the
        // page declares lt-ie9 support) treat it as a syntax error, which
        // would leave DOCUMENTATION_OPTIONS undefined.
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'../../',
            VERSION:'',
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true,
            SOURCELINK_SUFFIX: ".txt"
        };
    </script>
      <script type="text/javascript" src="../../_static/jquery.js"></script>
      <script type="text/javascript" src="../../_static/underscore.js"></script>
      <script type="text/javascript" src="../../_static/doctools.js"></script>
      <script type="text/javascript" src="../../_static/translations.js"></script>
      <script type="text/javascript" src="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"></script>
       
  

  
  
    <script type="text/javascript" src="../../_static/js/theme.js"></script>
  
  
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
  <script src="../../_static/js/paddle_doc_init.js"></script> 

</body>
</html>