py_data_provider_wrapper.html 19.5 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">


<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    
    <title>PyDataProviderWrapper API &mdash; PaddlePaddle  documentation</title>
    
    <link rel="stylesheet" href="../../_static/classic.css" type="text/css" />
    <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
    
    <script type="text/javascript">
      var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    '../../',
        VERSION:     '',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  true
      };
    </script>
    <script type="text/javascript" src="../../_static/jquery.js"></script>
    <script type="text/javascript" src="../../_static/underscore.js"></script>
    <script type="text/javascript" src="../../_static/doctools.js"></script>
    <script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
    <link rel="top" title="PaddlePaddle  documentation" href="../../index.html" />
    <link rel="up" title="User Interface" href="../index.html" />
    <link rel="next" title="Trainer Config Helpers" href="trainer_config_helpers/index.html" />
    <link rel="prev" title="Python Use Case" href="../data_provider/python_case.html" /> 
  </head>
  <body role="document">
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="../../genindex.html" title="General Index"
             accesskey="I">index</a></li>
        <li class="right" >
          <a href="../../py-modindex.html" title="Python Module Index"
             >modules</a> |</li>
        <li class="right" >
          <a href="trainer_config_helpers/index.html" title="Trainer Config Helpers"
             accesskey="N">next</a> |</li>
        <li class="right" >
          <a href="../data_provider/python_case.html" title="Python Use Case"
             accesskey="P">previous</a> |</li>
        <li class="nav-item nav-item-0"><a href="../../index.html">PaddlePaddle  documentation</a> &raquo;</li>
          <li class="nav-item nav-item-1"><a href="../index.html" accesskey="U">User Interface</a> &raquo;</li> 
      </ul>
    </div>  

    <div class="document">
      <div class="documentwrapper">
        <div class="bodywrapper">
          <div class="body" role="main">
            
  <div class="section" id="module-paddle.trainer.PyDataProviderWrapper">
<span id="pydataproviderwrapper-api"></span><h1>PyDataProviderWrapper API<a class="headerlink" href="#module-paddle.trainer.PyDataProviderWrapper" title="Permalink to this headline"></a></h1>
<p>This module provides a wrapper (decorator) that wraps a data-processing method into a
PyDataProvider. Some examples are shown in the <a class="reference external" href="../data_provider/python_case.html">Python Use Case</a> page.</p>
<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.DenseSlot">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">DenseSlot</code><span class="sig-paren">(</span><em>dim</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.DenseSlot" title="Permalink to this definition"></a></dt>
<dd><p>Dense Slot Type: Each item is the value of a Dense Vector.</p>
<p>Its yield format for <code class="code docutils literal"><span class="pre">provider</span></code> is:</p>
<ul class="simple">
<li><strong>NonSeq</strong>: [float, float, ... ]</li>
<li><strong>Seq</strong>: [[float, float, ...], [float, float, ...], ... ]</li>
<li><strong>SubSeq</strong>: [[[float, float, ...], [float, ...], ...], [[float, float, ...], [float, ...], ...], ...]</li>
</ul>
</dd></dl>

<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">SparseNonValueSlot</code><span class="sig-paren">(</span><em>dim</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.SparseNonValueSlot" title="Permalink to this definition"></a></dt>
<dd><p>Sparse NonValue Slot Type: Each item is the id of a Sparse Vector.</p>
<p>Its yield format for <code class="code docutils literal"><span class="pre">provider</span></code> is:</p>
<ul class="simple">
<li><strong>NonSeq</strong>: [int, int, ...]</li>
<li><strong>Seq</strong>: [[int, int, ...], [int, int, ...], ... ]</li>
<li><strong>SubSeq</strong>: [[[int, int, ...], [int, ...], ...], [[int, int, ...], [int, ...], ...], ...]</li>
</ul>
</dd></dl>

<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.SparseValueSlot">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">SparseValueSlot</code><span class="sig-paren">(</span><em>dim</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.SparseValueSlot" title="Permalink to this definition"></a></dt>
<dd><p>Sparse Value Slot Type: Each item is the id and value of a Sparse Vector.</p>
<p>Its yield format for <code class="code docutils literal"><span class="pre">provider</span></code> is:</p>
<ul class="simple">
<li><strong>NonSeq</strong>: [(int, float), (int, float), ... ]</li>
<li><strong>Seq</strong>: [[(int, float), (int, float), ... ], [(int, float), (int, float), ...], ... ]</li>
<li><strong>SubSeq</strong>: [[[(int, float), ...], [(int, float), ...], ...], [[(int, float), ...], [(int, float), ...], ...], ...]</li>
</ul>
</dd></dl>

<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.IndexSlot">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">IndexSlot</code><span class="sig-paren">(</span><em>dim</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.IndexSlot" title="Permalink to this definition"></a></dt>
<dd><p>Index Value Slot Type: Each item is the id of Label.</p>
<p>Its yield format for <code class="code docutils literal"><span class="pre">provider</span></code> is:</p>
<ul class="simple">
<li><strong>NonSeq</strong>: int</li>
<li><strong>Seq</strong>: [int, int, ...]</li>
<li><strong>SubSeq</strong>: [[int, int, ...], [int, int, ...], ... ]</li>
</ul>
</dd></dl>

<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.StringSlot">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">StringSlot</code><span class="sig-paren">(</span><em>dim</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.StringSlot" title="Permalink to this definition"></a></dt>
<dd><p>String Value Slot Type: Each item is a string for printout; it can be used in DataLayer too.</p>
<p>Its yield format for <code class="code docutils literal"><span class="pre">provider</span></code> is:</p>
<ul class="simple">
<li><strong>NonSeq</strong>: string</li>
<li><strong>Seq</strong>: [string, string, ...]</li>
<li><strong>SubSeq</strong>:  [[string, string, ...], [string, string, ...], ... ]</li>
</ul>
</dd></dl>

<dl class="class">
<dt id="paddle.trainer.PyDataProviderWrapper.PoolSize">
<em class="property">class </em><code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">PoolSize</code><span class="sig-paren">(</span><em>pool_size</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.PoolSize" title="Permalink to this definition"></a></dt>
<dd><p>Maximum number of samples contained in the provider.</p>
</dd></dl>

<dl class="function">
<dt id="paddle.trainer.PyDataProviderWrapper.provider">
<code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">provider</code><span class="sig-paren">(</span><em>slots=None</em>, <em>use_seq=False</em>, <em>should_shuffle=True</em>, <em>pool_size=1</em>, <em>can_over_batch_size=True</em>, <em>calc_batch_size=&lt;function &lt;lambda&gt;&gt;</em>, <em>debug=False</em>, <em>init_hook=&lt;function default_init_hook&gt;</em>, <em>profile_filename=None</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.provider" title="Permalink to this definition"></a></dt>
<dd><p>The decorator for PyDataProvider. Users should use this to create a Provider class.
Users only need to be concerned with how to read samples from a file.</p>
<p>So the basic usage is:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="nd">@provider</span><span class="p">(</span><span class="n">some</span> <span class="n">data</span> <span class="n">provider</span> <span class="n">config</span> <span class="n">here</span><span class="o">...</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">process</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">file_name</span><span class="p">):</span>
    <span class="k">while</span> <span class="ow">not</span> <span class="n">at</span> <span class="n">end</span> <span class="n">of</span> <span class="n">file_name</span><span class="p">:</span>
        <span class="n">sample</span> <span class="o">=</span> <span class="n">readOneSampleFromFile</span><span class="p">(</span><span class="n">file_name</span><span class="p">)</span>
        <span class="k">yield</span> <span class="n">sample</span>
</pre></div>
</div>
<p>The data provider should be configured with the following parameters:</p>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><ul class="first last simple">
<li><strong>init_hook</strong> (<em>callable</em>) &#8211; <p>A callback that is invoked when the PyDataProvider instance is created. Its parameters are (obj, *args, **kwargs).</p>
<ul>
<li><strong>obj</strong>: the actual data provider instance, which contains some global objects in obj.xxxxx and is used by the process function.<ol class="arabic">
<li><strong>obj.slots</strong>: a list of SlotType objects. Can be set in init. For example, obj.slots = [DenseSlot(9), IndexSlot(2)].</li>
<li><strong>obj.logger</strong>: a logger object. Users can invoke obj.logger.info(), obj.logger.fatal(), etc.</li>
</ol>
</li>
<li><strong>args</strong> and <strong>kwargs</strong>: the data provider __init__ parameters. For example, load_data_args will be found in **kwargs, and if you want to receive it from trainer_config, it is recommended to use init_hook_wrapper.</li>
</ul>
</li>
<li><strong>pool_size</strong> (<em>int | PoolSize</em>) &#8211; <ul>
<li><strong>int</strong>: it will read at most pool_size files to memory.</li>
<li><strong>PoolSize</strong>: it will read at most PoolSize.size samples to memory.</li>
<li>If not set, it will read all the files to memory.</li>
</ul>
</li>
<li><strong>slots</strong> (<em>list | callable</em>) &#8211; <p>Specify the SlotTypes, can also be set in init_hook. It has two formats:</p>
<ul>
<li>A list of SlotType objects. For example, slots =                     [DenseSlot(9), IndexSlot(2)].</li>
<li>A method that returns a list of SlotTypes; the parameters of the method are (obj, *file_list, **kwargs).</li>
</ul>
</li>
<li><strong>use_seq</strong> (<em>bool</em>) &#8211; <p>False if sequences are not used (Default). True if sequences are used:</p>
<ul>
<li>If the sequence has <strong>no sub-sequence</strong>: Each slot will return a list of data. This list is one sequence. So the return format looks like [[a0, a1, a2], [b1, b2, b3, b4], [c1]].</li>
<li>If the sequence has <strong>sub-sequence</strong>: Each slot will return a nested list of data. This list contains several sub-lists, and each sub-list is one sub-sequence. So the return format looks like [[[a0, a1, a2], [a4, a5]], [[b1, b2, b3, b4], [b5, b6]], [[c1], [c2]]].</li>
</ul>
</li>
<li><strong>should_shuffle</strong> (<em>bool</em>) &#8211; True if the data should be shuffled.</li>
<li><strong>calc_batch_size</strong> (<em>callable</em>) &#8211; <p>The method that calculates each data&#8217;s batch size.</p>
<ul>
<li>Default is the batch size of one sample.</li>
<li>Users can customize it with a <strong>lambda</strong> function. For example, <code class="code docutils literal"><span class="pre">calc_batch_size</span> <span class="pre">=</span> <span class="pre">lambda</span> <span class="pre">data</span> <span class="pre">:</span> <span class="pre">len(data)</span></code> means counting the number of tokens in a sequence.</li>
</ul>
</li>
<li><strong>can_over_batch_size</strong> (<em>bool</em>) &#8211; <p>Whether <code class="code docutils literal"><span class="pre">actual</span> <span class="pre">batch</span> <span class="pre">size</span> <span class="pre">&gt;=</span> <span class="pre">input</span> <span class="pre">batch</span> <span class="pre">size</span></code></p>
<ul>
<li><strong>True</strong> (&gt;=): getNextBatch method can return more data (Default).</li>
<li><strong>False</strong> (&lt;): user must ensure that each data&#8217;s batch size &lt; input batch size.</li>
</ul>
</li>
<li><strong>debug</strong> (<em>bool</em>) &#8211; True to enable the debug logger and some debug checks. Default is False.</li>
<li><strong>profile_filename</strong> (<em>None | Str</em>) &#8211; None to disable profiling (Default). Otherwise, the data provider will dump the profile result on reset, and the dump filename is <strong>&lt;profile_filename&gt;_&lt;reset_count&gt;</strong>.</li>
</ul>
</td>
</tr>
</tbody>
</table>
</dd></dl>

<dl class="function">
<dt id="paddle.trainer.PyDataProviderWrapper.init_hook_wrapper">
<code class="descclassname">paddle.trainer.PyDataProviderWrapper.</code><code class="descname">init_hook_wrapper</code><span class="sig-paren">(</span><em>func</em><span class="sig-paren">)</span><a class="headerlink" href="#paddle.trainer.PyDataProviderWrapper.init_hook_wrapper" title="Permalink to this definition"></a></dt>
<dd><p>Wrap a method for use as PyDataProviderWrapper&#8217;s init_hook. The wrapped method can
receive parameters from trainer_config&#8217;s load_data_args. The load_data_args value
must be the result of pickle.dumps() applied to a map, whose entries are used as keyword args. The
wrapped method <code class="code docutils literal"><span class="pre">func</span></code> will receive them as keyword args.</p>
<p>So an example usage is:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="nd">@init_hook_wrapper</span>
<span class="k">def</span> <span class="nf">hook</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">dictionary</span><span class="p">,</span> <span class="n">file_list</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
    <span class="n">obj</span><span class="o">.</span><span class="n">dictionary</span> <span class="o">=</span> <span class="n">dictionary</span>
    <span class="n">obj</span><span class="o">.</span><span class="n">slots</span> <span class="o">=</span> <span class="p">[</span><span class="n">IndexSlot</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">obj</span><span class="o">.</span><span class="n">dictionary</span><span class="p">)),</span>
                 <span class="n">IndexSlot</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="nb">open</span><span class="p">(</span><span class="n">file_list</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="s2">&quot;r&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">readlines</span><span class="p">()))]</span>
</pre></div>
</div>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">Parameters:</th><td class="field-body"><strong>func</strong> (<em>callable</em>) &#8211; init_hook function</td>
</tr>
<tr class="field-even field"><th class="field-name">Returns:</th><td class="field-body">wrapped method, can be passed into &#64;provider.</td>
</tr>
</tbody>
</table>
</dd></dl>

</div>


          </div>
        </div>
      </div>
      <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
        <div class="sphinxsidebarwrapper">
  <h4>Previous topic</h4>
  <p class="topless"><a href="../data_provider/python_case.html"
                        title="previous chapter">Python Use Case</a></p>
  <h4>Next topic</h4>
  <p class="topless"><a href="trainer_config_helpers/index.html"
                        title="next chapter">Trainer Config Helpers</a></p>
  <div role="note" aria-label="source link">
    <h3>This Page</h3>
    <ul class="this-page-menu">
      <li><a href="../../_sources/ui/api/py_data_provider_wrapper.txt"
            rel="nofollow">Show Source</a></li>
    </ul>
   </div>
<div id="searchbox" style="display: none" role="search">
  <h3>Quick search</h3>
    <form class="search" action="../../search.html" method="get">
      <input type="text" name="q" />
      <input type="submit" value="Go" />
      <input type="hidden" name="check_keywords" value="yes" />
      <input type="hidden" name="area" value="default" />
    </form>
    <p class="searchtip" style="font-size: 90%">
    Enter search terms or a module, class or function name.
    </p>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
        </div>
      </div>
      <div class="clearer"></div>
    </div>
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="../../genindex.html" title="General Index"
             >index</a></li>
        <li class="right" >
          <a href="../../py-modindex.html" title="Python Module Index"
             >modules</a> |</li>
        <li class="right" >
          <a href="trainer_config_helpers/index.html" title="Trainer Config Helpers"
             >next</a> |</li>
        <li class="right" >
          <a href="../data_provider/python_case.html" title="Python Use Case"
             >previous</a> |</li>
        <li class="nav-item nav-item-0"><a href="../../index.html">PaddlePaddle  documentation</a> &raquo;</li>
          <li class="nav-item nav-item-1"><a href="../index.html" >User Interface</a> &raquo;</li> 
      </ul>
    </div>
    <div class="footer" role="contentinfo">
        &copy; Copyright 2016, PaddlePaddle developers.
      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.3.5.
    </div>
  </body>
</html>