index_cn.html 40.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549


<!DOCTYPE html>
<!-- Root element: the page content is Simplified Chinese, so declare it for screen readers, hyphenation and search engines. -->
<!--[if IE 8]><html class="no-js lt-ie9" lang="zh-CN" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="zh-CN" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>情感分析教程 &mdash; PaddlePaddle  文档</title>
  

  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
  

  
  
        <link rel="index" title="索引"
              href="../../genindex.html"/>
        <link rel="search" title="搜索" href="../../search.html"/>
    <link rel="top" title="PaddlePaddle  文档" href="../../index.html"/> 

  <link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/override.css" type="text/css" />
  <script>
  // Analytics loader for hm.baidu.com.
  // _hmt is a global array, reused if it already exists; presumably the
  // command queue consumed by hm.js (confirm against the vendor docs).
  var _hmt = _hmt || [];
  (function() {
    var hm = document.createElement("script");
    // Explicit https: a protocol-relative URL resolves to file:// when the docs
    // are opened from disk and allows the tracker to be fetched over plain http.
    hm.src = "https://hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
    // Insert ahead of the first <script> in the document (this one, at minimum).
    var s = document.getElementsByTagName("script")[0];
    s.parentNode.insertBefore(hm, s);
  })();
  </script>

  

  
  <script src="../../_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

  
  <header class="site-header">
    <div class="site-logo">
      <!-- Logo links to the site root; the alt text gives the link an accessible name. -->
      <a href="/"><img src="../../_static/images/PP_w.png" alt="PaddlePaddle"></a>
    </div>
    <div class="site-nav-links">
      <div class="site-menu">
        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on GitHub</a>
        <div class="language-switcher dropdown">
          <a type="button" data-toggle="dropdown">
            <span>English</span>
            <i class="fa fa-angle-up"></i>
            <i class="fa fa-angle-down"></i>
          </a>
          <ul class="dropdown-menu">
            <li><a href="/doc_cn">中文</a></li>
            <li><a href="/doc">English</a></li>
          </ul>
        </div>
        <ul class="site-page-links">
          <li><a href="/">Home</a></li>
        </ul>
      </div>
      <div class="doc-module">
        
        <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_cn.html">新手入门</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../howto/index_cn.html">进阶指南</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/index_cn.html">API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../faq/index_cn.html">FAQ</a></li>
</ul>

        
<!-- Documentation search: submits the query as ?q=… (GET) to ../../search.html. -->
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
    <!-- A placeholder is not an accessible label, so the field is named explicitly. -->
    <input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>
      </div>
    </div>
  </header>
  
  <div class="main-content-wrap">

    
    <nav class="doc-menu-vertical" role="navigation">
        
          
          <ul>
<li class="toctree-l1"><a class="reference internal" href="../../getstarted/index_cn.html">新手入门</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../getstarted/build_and_install/index_cn.html">安装与编译</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/docker_install_cn.html">PaddlePaddle的Docker容器使用方式</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/ubuntu_install_cn.html">Ubuntu部署PaddlePaddle</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../getstarted/build_and_install/cmake/build_from_source_cn.html">PaddlePaddle的编译选项</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../getstarted/concepts/use_concepts_cn.html">基本使用概念</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../howto/index_cn.html">进阶指南</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../howto/usage/cmd_parameter/index_cn.html">设置命令行参数</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../howto/usage/cmd_parameter/use_case_cn.html">使用案例</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../howto/usage/cmd_parameter/arguments_cn.html">参数概述</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../howto/usage/cmd_parameter/detail_introduction_cn.html">细节描述</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/usage/cluster/cluster_train_cn.html">运行分布式训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/usage/k8s/k8s_basis_cn.html">Kubernetes 简介</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/usage/k8s/k8s_cn.html">Kubernetes单机训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/usage/k8s/k8s_distributed_cn.html">Kubernetes分布式训练</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/dev/write_docs_cn.html">如何贡献/修改文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/dev/contribute_to_paddle_cn.html">如何贡献代码</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/deep_model/rnn/index_cn.html">RNN相关模型</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../howto/deep_model/rnn/recurrent_group_cn.html">Recurrent Group教程</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../howto/deep_model/rnn/hierarchical_layer_cn.html">支持双层序列作为输入的Layer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../howto/deep_model/rnn/hrnn_rnn_api_compare_cn.html">单双层RNN API对比介绍</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../howto/optimization/gpu_profiling_cn.html">GPU性能分析与调优</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../api/index_cn.html">API</a><ul>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/model_configs.html">模型配置</a><ul>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/activation.html">Activation</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/layer.html">Layers</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/optimizer.html">Optimizer</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/pooling.html">Pooling</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/networks.html">Networks</a></li>
<li class="toctree-l3"><a class="reference internal" href="../../api/v2/config/attr.html">Parameter Attribute</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/data.html">数据访问</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../api/v2/run_logic.html">训练与应用</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../faq/index_cn.html">FAQ</a></li>
</ul>

        
    </nav>
    
    <section class="doc-content-wrap">

      

 







<div role="navigation" aria-label="breadcrumbs navigation">
  <ul class="wy-breadcrumbs">
      
    <li>情感分析教程</li>
  </ul>
</div>
      
      <div class="wy-nav-content" id="doc-content">
        <div class="rst-content">
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="">
<span id="id1"></span><h1>情感分析教程<a class="headerlink" href="#id1" title="永久链接至标题"></a></h1>
<p>情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性,给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如:把用户在购物网站、旅游网站、团购网站(亚马逊、天猫、淘宝等)上发表的评论分成正面评论和负面评论两类。</p>
<p>情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如,研究人员分析了几个关于消费者信心和政治观点的调查,结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。</p>
<p>另一方面,抓取产品的用户评论并分析他们的情感,有助于理解用户对不同公司,不同产品,甚至不同竞争对手产品的偏好。</p>
<p>本教程将指导您完成长短期记忆(LSTM)网络的训练过程,以分类来自<a class="reference external" href="http://ai.stanford.edu/~amaas/data/sentiment/">大型电影评论数据集</a>(有时称为<a class="reference external" href="http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf">互联网电影数据库 (IMDB)</a>)的句子的情感。 此数据集包含电影评论及其相关联的类别标签,即正面和负面。</p>
<div class="section" id="">
<span id="id2"></span><h2>数据准备<a class="headerlink" href="#id2" title="永久链接至标题"></a></h2>
<div class="section" id="imdb">
<span id="imdb"></span><h3>IMDB 数据介绍<a class="headerlink" href="#imdb" title="永久链接至标题"></a></h3>
<p>训练模型之前, 我们需要预处理数据并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数据集和<a class="reference external" href="http://www.statmt.org/moses/">Moses</a>工具, 这是一个基于统计的机器翻译系统。 我们提供了一个数据预处理脚本,它不仅能够处理IMDB数据,还能处理其他用户自定义的数据。 为了使用提前编写的脚本,需要将标记的训练和测试样本移动到另一个路径,这已经在<code class="docutils literal"><span class="pre">get_imdb.sh</span></code>中完成。</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">demo</span><span class="o">/</span><span class="n">sentiment</span><span class="o">/</span><span class="n">data</span>
<span class="o">./</span><span class="n">get_imdb</span><span class="o">.</span><span class="n">sh</span>
</pre></div>
</div>
<p>如果数据获取成功,你将在目录<code class="docutils literal"><span class="pre">./demo/sentiment/data</span></code>中看到下面的文件:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">aclImdb</span>  <span class="n">get_imdb</span><span class="o">.</span><span class="n">sh</span>  <span class="n">imdb</span>  <span class="n">mosesdecoder</span><span class="o">-</span><span class="n">master</span>
</pre></div>
</div>
<ul class="simple">
<li>aclImdb: 从外部网站上下载的原始数据集。</li>
<li>imdb: 仅包含训练和测试数据集。</li>
<li>mosesdecoder-master: Moses 工具。</li>
</ul>
<p>IMDB数据集包含25,000个已标注过的高极性电影评论用于训练,25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得分大于等于7,总评分10分。 运行完脚本 <code class="docutils literal"><span class="pre">./get_imdb.sh</span></code>后, 我们可以看到在目录 <code class="docutils literal"><span class="pre">aclImdb</span></code>中的数据集的结构如下:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">imdbEr</span><span class="o">.</span><span class="n">txt</span>  <span class="n">imdb</span><span class="o">.</span><span class="n">vocab</span>  <span class="n">README</span>  <span class="n">test</span>  <span class="n">train</span>
</pre></div>
</div>
<ul class="simple">
<li>train: 训练数据集。</li>
<li>test : 测试数据集。</li>
<li>imdb.vocab: 字典文件。</li>
<li>imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。</li>
<li>README: 数据说明文档。</li>
</ul>
<p>测试集和训练集目录包含下面的文件:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">labeledBow</span><span class="o">.</span><span class="n">feat</span>  <span class="n">neg</span>  <span class="n">pos</span>  <span class="n">unsup</span>  <span class="n">unsupBow</span><span class="o">.</span><span class="n">feat</span>  <span class="n">urls_neg</span><span class="o">.</span><span class="n">txt</span>  <span class="n">urls_pos</span><span class="o">.</span><span class="n">txt</span>  <span class="n">urls_unsup</span><span class="o">.</span><span class="n">txt</span>
</pre></div>
</div>
<ul class="simple">
<li>pos: 正面评价样本,包含12,500个txt文件,每个文件是一个电影评论。</li>
<li>neg: 负面评价样本,包含12,500个txt文件,每个文件是一个电影评论。</li>
<li>unsup: 未标记的评价样本,包含50,000个txt文件。</li>
<li>urls_xx.txt: 每个评论的网址。</li>
<li>xxBow.feat: 用于统计词频的Bow模型特征。</li>
</ul>
</div>
<div class="section" id="imdb">
<span id="id3"></span><h3>IMDB 数据准备<a class="headerlink" href="#id3" title="永久链接至标题"></a></h3>
<p>在这个例子中,我们只使用已经标注过的训练集和测试集,且默认在训练集上构建字典,而不使用IMDB数据集中的imdb.vocab作为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本<code class="docutils literal"><span class="pre">tokenizer.perl</span></code> 用于切分单词和标点符号。执行下面的命令就可以预处理数据。</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">demo</span><span class="o">/</span><span class="n">sentiment</span><span class="o">/</span>
<span class="o">./</span><span class="n">preprocess</span><span class="o">.</span><span class="n">sh</span>
</pre></div>
</div>
<p>preprocess.sh:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">data_dir</span><span class="o">=</span><span class="s2">&quot;./data/imdb&quot;</span>
<span class="n">python</span> <span class="n">preprocess</span><span class="o">.</span><span class="n">py</span> <span class="o">-</span><span class="n">i</span> $<span class="n">data_dir</span>
</pre></div>
</div>
<ul class="simple">
<li>data_dir: 输入数据所在目录。</li>
<li>preprocess.py: 预处理脚本。</li>
</ul>
<p>运行成功后目录<code class="docutils literal"><span class="pre">demo/sentiment/data/pre-imdb</span></code> 结构如下:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="nb">dict</span><span class="o">.</span><span class="n">txt</span>  <span class="n">labels</span><span class="o">.</span><span class="n">list</span>  <span class="n">test</span><span class="o">.</span><span class="n">list</span>  <span class="n">test_part_000</span>  <span class="n">train</span><span class="o">.</span><span class="n">list</span>  <span class="n">train_part_000</span>
</pre></div>
</div>
<ul class="simple">
<li>test_part_000 and train_part_000: 所有标记的测试集和训练集, 训练集已经随机打乱。</li>
<li>train.list and test.list: 训练集和测试集文件列表。</li>
<li>dict.txt: 利用训练集生成的字典。</li>
<li>labels.list: neg  0, pos 1, 含义:标签0表示负面的评论,标签1表示正面的评论。</li>
</ul>
</div>
<div class="section" id="">
<span id="id4"></span><h3>用户自定义数据预处理<a class="headerlink" href="#id4" title="永久链接至标题"></a></h3>
<p>如果你执行其它的用情感分析来分类文本的任务,可以按如下的结构来准备数据。 我们提供了脚本来构建字典和预处理数据。所以你只用按下面的结构来组织数据就行了。</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">dataset</span>
<span class="o">|----</span><span class="n">train</span>
<span class="o">|</span>    <span class="o">|----</span><span class="n">class1</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">|----</span><span class="n">text_files</span>
<span class="o">|</span>    <span class="o">|----</span><span class="n">class2</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">|----</span><span class="n">text_files</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">...</span>
<span class="o">|----</span><span class="n">test</span>
<span class="o">|</span>    <span class="o">|----</span><span class="n">class1</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">|----</span><span class="n">text_files</span>
<span class="o">|</span>    <span class="o">|----</span><span class="n">class2</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">|----</span><span class="n">text_files</span>
<span class="o">|</span>    <span class="o">|</span>    <span class="o">...</span>
</pre></div>
</div>
<ul class="simple">
<li>dataset: 一级目录。</li>
<li>train, test: 二级目录。</li>
<li>class1,class2,...: 三级目录。</li>
<li>text_files: 文本格式的实例文件。</li>
</ul>
<p>所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例,每一行表示一个实例。 为了充分地随机打乱训练集, 在预处理含有多行数据的文本文件时参数设置稍有不同, 执行<code class="docutils literal"><span class="pre">preprocess.sh</span></code>脚本时需要加上<code class="docutils literal"><span class="pre">-m</span> <span class="pre">True</span></code>参数。 tokenizer.perl 默认用来切分单词和标点符号,如果你不需要这个操作,在运行<code class="docutils literal"><span class="pre">preprocess.sh</span></code>时加上<code class="docutils literal"><span class="pre">-t</span> <span class="pre">False</span></code>参数即可。</p>
</div>
</div>
<div class="section" id="">
<span id="id5"></span><h2>训练模型<a class="headerlink" href="#id5" title="永久链接至标题"></a></h2>
<p>在这步任务中,我们使用了循环神经网络(RNN)的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服梯度消失的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元,遗忘门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息,而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内,存储在记忆单元区块的历史信息被更新用来迭代地学习单词以合理的序列呈现。</p>
<p><center><img alt="LSTM" src="../../_images/lstm.png" /></center>
<center>图表 1. LSTM [3]</center></p>
<p>情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常,仅仅是一些关键词,如形容词和副词,在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长,例如 IMDB的数据集。 我们之所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先,它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二,它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三,它直接学习段落表示,而不是组合上下文级别信息。</p>
<p>在本演示中,我们提供两个网络,即双向LSTM和三层堆叠LSTM。</p>
<div class="section" id="lstm">
<span id="lstm"></span><h3>双向LSTM<a class="headerlink" href="#lstm" title="永久链接至标题"></a></h3>
<p>图2是双向LSTM网络,后面连全连接层和softmax层。</p>
<p><center><img alt="BiLSTM" src="../../_images/bi_lstm1.jpg" /></center>
<center>图 2. Bidirectional-LSTM </center></p>
</div>
<div class="section" id="stacked-lstm">
<span id="stacked-lstm"></span><h3>Stacked-LSTM<a class="headerlink" href="#stacked-lstm" title="永久链接至标题"></a></h3>
<p>图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来,连接三个LSTM隐藏层,并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后,使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。</p>
<p><center><img alt="StackedLSTM" src="../../_images/stacked_lstm.jpg" /></center>
<center>图 3. Stacked-LSTM for sentiment analysis </center></p>
<p><strong>配置</strong></p>
<p>进入<code class="docutils literal"><span class="pre">demo/sentiment</span></code> 目录 , <code class="docutils literal"><span class="pre">trainer_config.py</span></code> 是一个配置文件的例子, 其中包含算法和网络配置。第一行从<code class="docutils literal"><span class="pre">sentiment_net.py</span></code>中导出预定义的网络。</p>
<p>trainer_config.py:</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">sentiment_net</span> <span class="kn">import</span> <span class="o">*</span>

<span class="n">data_dir</span>  <span class="o">=</span> <span class="s2">&quot;./data/pre-imdb&quot;</span>
<span class="c1"># whether this config is used for test</span>
<span class="n">is_test</span> <span class="o">=</span> <span class="n">get_config_arg</span><span class="p">(</span><span class="s1">&#39;is_test&#39;</span><span class="p">,</span> <span class="nb">bool</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span>
<span class="c1"># whether this config is used for prediction</span>
<span class="n">is_predict</span> <span class="o">=</span> <span class="n">get_config_arg</span><span class="p">(</span><span class="s1">&#39;is_predict&#39;</span><span class="p">,</span> <span class="nb">bool</span><span class="p">,</span> <span class="bp">False</span><span class="p">)</span>
<span class="n">dict_dim</span><span class="p">,</span> <span class="n">class_dim</span> <span class="o">=</span> <span class="n">sentiment_data</span><span class="p">(</span><span class="n">data_dir</span><span class="p">,</span> <span class="n">is_test</span><span class="p">,</span> <span class="n">is_predict</span><span class="p">)</span>

<span class="c1">################## Algorithm Config #####################</span>

<span class="n">settings</span><span class="p">(</span>
  <span class="n">batch_size</span><span class="o">=</span><span class="mi">128</span><span class="p">,</span>
  <span class="n">learning_rate</span><span class="o">=</span><span class="mf">2e-3</span><span class="p">,</span>
  <span class="n">learning_method</span><span class="o">=</span><span class="n">AdamOptimizer</span><span class="p">(),</span>
  <span class="n">regularization</span><span class="o">=</span><span class="n">L2Regularization</span><span class="p">(</span><span class="mf">8e-4</span><span class="p">),</span>
  <span class="n">gradient_clipping_threshold</span><span class="o">=</span><span class="mi">25</span>
<span class="p">)</span>

<span class="c1">#################### Network Config ######################</span>
<span class="n">stacked_lstm_net</span><span class="p">(</span><span class="n">dict_dim</span><span class="p">,</span> <span class="n">class_dim</span><span class="o">=</span><span class="n">class_dim</span><span class="p">,</span>
                 <span class="n">stacked_num</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">is_predict</span><span class="o">=</span><span class="n">is_predict</span><span class="p">)</span>
<span class="c1">#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)</span>
</pre></div>
</div>
<ul class="simple">
<li><strong>数据定义</strong>:<ul>
<li>get_config_arg(): 获取通过 <code class="docutils literal"><span class="pre">--config_args=xx</span></code> 设置的命令行参数。</li>
<li>定义训练数据和测试数据提供者, 这里使用了PaddlePaddle的Python接口来加载数据。想了解更多细节可以参考PyDataProvider部分的文档。</li>
</ul>
</li>
<li><strong>算法配置</strong>:<ul>
<li>使用随机梯度下降(sgd)算法。</li>
<li>使用 adam 优化。</li>
<li>设置batch size大小为128。</li>
<li>设置平均sgd窗口。</li>
<li>设置全局学习率。</li>
</ul>
</li>
<li><strong>网络配置</strong>:<ul>
<li>dict_dim: 获取字典维度。</li>
<li>class_dim: 设置类别数,IMDB有两个标签,即正面评价标签和负面评价标签。</li>
<li><code class="docutils literal"><span class="pre">stacked_lstm_net</span></code>: 预定义网络如图3所示,默认情况下使用此网络</li>
<li><code class="docutils literal"><span class="pre">bidirectional_lstm_net</span></code>: 预定义网络,如图2所示。</li>
</ul>
</li>
</ul>
<p><strong>训练</strong></p>
<p>首先安装PaddlePaddle。 然后使用下面的脚本 <code class="docutils literal"><span class="pre">train.sh</span></code> 来开启本地的训练。</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">demo</span><span class="o">/</span><span class="n">sentiment</span><span class="o">/</span>
<span class="o">./</span><span class="n">train</span><span class="o">.</span><span class="n">sh</span>
</pre></div>
</div>
<p>train.sh:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>config=trainer_config.py
output=./model_output
paddle train --config=$config \
             --save_dir=$output \
             --job=train \
             --use_gpu=false \
             --trainer_count=4 \
             --num_passes=10 \
             --log_period=20 \
             --dot_period=20 \
             --show_parameter_stats_period=100 \
             --test_all_data_in_one_period=1 \
             2&gt;&amp;1 | tee &#39;train.log&#39;
</pre></div>
</div>
<ul class="simple">
<li>--config=$config: 设置网络配置。</li>
<li>--save_dir=$output: 设置输出路径以保存训练完成的模型。</li>
<li>--job=train: 设置工作模式为训练。</li>
<li>--use_gpu=false: 使用CPU训练,如果你安装GPU版本的PaddlePaddle,并想使用GPU来训练设置为true。</li>
<li>--trainer_count=4:设置线程数(或GPU个数)。</li>
<li>--num_passes=10: 设置pass,PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。</li>
<li>--log_period=20: 每20个batch打印一次日志。</li>
<li>--show_parameter_stats_period=100: 每100个batch打印一次统计信息。</li>
<li>--test_all_data_in_one_period=1: 每次测试都测试所有数据。</li>
</ul>
<p>如果运行成功,输出日志保存在路径 <code class="docutils literal"><span class="pre">demo/sentiment/train.log</span></code>中,模型保存在目录<code class="docutils literal"><span class="pre">demo/sentiment/model_output/</span></code>中。  输出日志说明如下:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Batch</span><span class="o">=</span><span class="mi">20</span> <span class="n">samples</span><span class="o">=</span><span class="mi">2560</span> <span class="n">AvgCost</span><span class="o">=</span><span class="mf">0.681644</span> <span class="n">CurrentCost</span><span class="o">=</span><span class="mf">0.681644</span> <span class="n">Eval</span><span class="p">:</span> <span class="n">classification_error_evaluator</span><span class="o">=</span><span class="mf">0.36875</span>  <span class="n">CurrentEval</span><span class="p">:</span> <span class="n">classification_error_evaluator</span><span class="o">=</span><span class="mf">0.36875</span>
<span class="o">...</span>
<span class="n">Pass</span><span class="o">=</span><span class="mi">0</span> <span class="n">Batch</span><span class="o">=</span><span class="mi">196</span> <span class="n">samples</span><span class="o">=</span><span class="mi">25000</span> <span class="n">AvgCost</span><span class="o">=</span><span class="mf">0.418964</span> <span class="n">Eval</span><span class="p">:</span> <span class="n">classification_error_evaluator</span><span class="o">=</span><span class="mf">0.1922</span>
<span class="n">Test</span> <span class="n">samples</span><span class="o">=</span><span class="mi">24999</span> <span class="n">cost</span><span class="o">=</span><span class="mf">0.39297</span> <span class="n">Eval</span><span class="p">:</span> <span class="n">classification_error_evaluator</span><span class="o">=</span><span class="mf">0.149406</span>
</pre></div>
</div>
<ul class="simple">
<li>Batch=xx: 表示训练了xx个Batch。</li>
<li>samples=xx: 表示训练了xx个样本。</li>
<li>AvgCost=xx: 从第0个batch到当前batch的平均损失。</li>
<li>CurrentCost=xx: 最新log_period个batch处理的当前损失。</li>
<li>Eval: classification_error_evaluator=xx: 表示第0个batch到当前batch的分类错误。</li>
<li>CurrentEval: classification_error_evaluator: 最新log_period个batch的分类错误。</li>
<li>Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。</li>
</ul>
<p>默认情况下,我们使用<code class="docutils literal"><span class="pre">stacked_lstm_net</span></code>网络,当传递相同的样本数时,它的收敛速度比<code class="docutils literal"><span class="pre">bidirectional_lstm_net</span></code>快。如果要使用双向LSTM,只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。</p>
</div>
</div>
<div class="section" id="">
<span id="id6"></span><h2>测试模型<a class="headerlink" href="#id6" title="永久链接至标题"></a></h2>
<p>测试模型是指使用训练出的模型评估已标记的验证集。</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">demo</span><span class="o">/</span><span class="n">sentiment</span>
<span class="o">./</span><span class="n">test</span><span class="o">.</span><span class="n">sh</span>
</pre></div>
</div>
<p>test.sh:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="k">function</span> get_best_pass<span class="o">()</span> <span class="o">{</span>
  cat <span class="nv">$1</span>  <span class="p">|</span> grep -Pzo <span class="s1">&#39;Test .*\n.*pass-.*&#39;</span> <span class="p">|</span> <span class="se">\</span>
  sed  -r <span class="s1">&#39;N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g&#39;</span> <span class="p">|</span> <span class="se">\</span>
  sort <span class="p">|</span> head -n <span class="m">1</span>
<span class="o">}</span>

<span class="nv">log</span><span class="o">=</span>train.log
<span class="nv">LOG</span><span class="o">=</span><span class="sb">`</span>get_best_pass <span class="nv">$log</span><span class="sb">`</span>
<span class="nv">LOG</span><span class="o">=(</span><span class="si">${</span><span class="nv">LOG</span><span class="si">}</span><span class="o">)</span>
<span class="nv">evaluate_pass</span><span class="o">=</span><span class="s2">&quot;model_output/pass-</span><span class="si">${</span><span class="nv">LOG</span><span class="p">[1]</span><span class="si">}</span><span class="s2">&quot;</span>

<span class="nb">echo</span> <span class="s1">&#39;evaluating from pass &#39;</span><span class="nv">$evaluate_pass</span>

<span class="nv">model_list</span><span class="o">=</span>./model.list
touch <span class="nv">$model_list</span> <span class="p">|</span> <span class="nb">echo</span> <span class="nv">$evaluate_pass</span> &gt; <span class="nv">$model_list</span>
<span class="nv">net_conf</span><span class="o">=</span>trainer_config.py
paddle train --config<span class="o">=</span><span class="nv">$net_conf</span> <span class="se">\</span>
             --model_list<span class="o">=</span><span class="nv">$model_list</span> <span class="se">\</span>
             --job<span class="o">=</span><span class="nb">test</span> <span class="se">\</span>
             --use_gpu<span class="o">=</span><span class="nb">false</span> <span class="se">\</span>
             --trainer_count<span class="o">=</span><span class="m">4</span> <span class="se">\</span>
             --config_args<span class="o">=</span><span class="nv">is_test</span><span class="o">=</span><span class="m">1</span> <span class="se">\</span>
             <span class="m">2</span>&gt;<span class="p">&amp;</span><span class="m">1</span> <span class="p">|</span> tee <span class="s1">&#39;test.log&#39;</span>
</pre></div>
</div>
<p>函数<code class="docutils literal"><span class="pre">get_best_pass</span></code>依据分类错误率获得最佳模型进行测试。 在本示例中,我们默认使用IMDB的测试数据集作为验证。 与训练不同,它需要在这里指定<code class="docutils literal"><span class="pre">--job=test</span></code>和模型路径,即<code class="docutils literal"><span class="pre">--model_list=$model_list</span></code>。如果运行成功,日志将保存在路径 <code class="docutils literal"><span class="pre">demo/sentiment/test.log</span></code>中。例如,在我们的测试中,最好的模型是<code class="docutils literal"><span class="pre">model_output/pass-00002</span></code>,分类误差是0.115645,如下:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Pass</span><span class="o">=</span><span class="mi">0</span> <span class="n">samples</span><span class="o">=</span><span class="mi">24999</span> <span class="n">AvgCost</span><span class="o">=</span><span class="mf">0.280471</span> <span class="n">Eval</span><span class="p">:</span> <span class="n">classification_error_evaluator</span><span class="o">=</span><span class="mf">0.115645</span>
</pre></div>
</div>
</div>
<div class="section" id="">
<span id="id7"></span><h2>预测<a class="headerlink" href="#id7" title="永久链接至标题"></a></h2>
<p><code class="docutils literal"><span class="pre">predict.py</span></code>脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">cd</span> <span class="n">demo</span><span class="o">/</span><span class="n">sentiment</span>
<span class="o">./</span><span class="n">predict</span><span class="o">.</span><span class="n">sh</span>
</pre></div>
</div>
<p>predict.sh:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span>#Note the default model is pass-00002, you should make sure the model path
#exists or change the model path.
model=model_output/pass-00002/
config=trainer_config.py
label=data/pre-imdb/labels.list
cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
     --tconf=$config \
     --model=$model \
     --label=$label \
     --dict=./data/pre-imdb/dict.txt \
     --batch_size=1
</pre></div>
</div>
<ul class="simple">
<li><code class="docutils literal"><span class="pre">cat</span> <span class="pre">./data/aclImdb/test/pos/10007_10.txt</span></code> : 输入预测样本。</li>
<li><code class="docutils literal"><span class="pre">predict.py</span></code> : 预测接口脚本。</li>
<li><code class="docutils literal"><span class="pre">--tconf=$config</span></code> : 设置网络配置。</li>
<li><code class="docutils literal"><span class="pre">--model=$model</span></code> : 设置模型路径。</li>
<li><code class="docutils literal"><span class="pre">--label=$label</span></code> : 设置标签类别字典,这个字典是整数标签和字符串标签的一个对应。</li>
<li><code class="docutils literal"><span class="pre">--dict=data/pre-imdb/dict.txt</span></code> : 设置字典文件。</li>
<li><code class="docutils literal"><span class="pre">--batch_size=1</span></code> : 设置batch size。</li>
</ul>
<p>注意应该确保默认模型路径<code class="docutils literal"><span class="pre">model_output/pass-00002</span></code>存在或更改为其它模型路径。</p>
<p>本示例的预测结果:</p>
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Loading</span> <span class="n">parameters</span> <span class="kn">from</span> <span class="nn">model_output</span><span class="o">/</span><span class="k">pass</span><span class="o">-</span><span class="mi">00002</span><span class="o">/</span>
<span class="o">./</span><span class="n">data</span><span class="o">/</span><span class="n">aclImdb</span><span class="o">/</span><span class="n">test</span><span class="o">/</span><span class="n">pos</span><span class="o">/</span><span class="mi">10014</span><span class="n">_7</span><span class="o">.</span><span class="n">txt</span><span class="p">:</span> <span class="n">predicting</span> <span class="n">label</span> <span class="ow">is</span> <span class="n">pos</span>
</pre></div>
</div>
<p>我们真诚地感谢您的关注,并欢迎您来参与贡献。</p>
</div>
<div class="section">
<span id="id8"></span><h2>参考文档<a class="headerlink" href="#id8" title="永久链接至标题"></a></h2>
<p>[1] Brendan O&#8217;Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. <a class="reference external" href="http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf">From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series</a>. In ICWSM-2010. <br>
[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. <a class="reference external" href="http://arxiv.org/abs/1010.3003">Twitter mood predicts the stock market</a>, Journal of Computational Science.<br>
[3] Alex Graves, Marcus Liwicki, Santiago Fernández, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. <a class="reference external" href="http://www.cs.toronto.edu/~graves/tpami_2009.pdf">A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence</a>, 31(5):855–868.<br>
[4] Zachary C. Lipton, <a class="reference external" href="http://arxiv.org/abs/1506.00019v1">A Critical Review of Recurrent Neural Networks for Sequence Learning</a>, arXiv:1506.00019. <br>
[5] Jie Zhou and Wei Xu; <a class="reference external" href="http://www.aclweb.org/anthology/P/P15/P15-1109.pdf">End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks</a>; ACL-IJCNLP 2015. <br></p>
</div>
</div>


           </div>
          </div>
          <footer>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2016, PaddlePaddle developers.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        // Page-level documentation options exposed as a global object.
        // NOTE(review): presumably read by the _static scripts loaded after
        // this block (e.g. doctools.js) — confirm against those files.
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT: '../../',
            VERSION: '',
            COLLAPSE_INDEX: false,
            FILE_SUFFIX: '.html',
            HAS_SOURCE: true,
            SOURCELINK_SUFFIX: '.txt'
        };
    </script>
      <script type="text/javascript" src="../../_static/jquery.js"></script>
      <script type="text/javascript" src="../../_static/underscore.js"></script>
      <script type="text/javascript" src="../../_static/doctools.js"></script>
      <script type="text/javascript" src="../../_static/translations.js"></script>
      <script type="text/javascript" src="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"></script>
       
  

  
  
    <script type="text/javascript" src="../../_static/js/theme.js"></script>
  
  
  <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
  <script src="../../_static/js/paddle_doc_init.js"></script> 

</body>
</html>