Commit 59b4b872 authored by Yibing Liu

Merge branch 'develop' into ctc_decoder_deploy

@@ -25,3 +25,11 @@
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: convert-markdown-into-html
name: convert-markdown-into-html
description: Convert README.md into index.html
entry: python .pre-commit-hooks/convert_markdown_into_html.py
language: system
files: .+README\.md$
import argparse
import re
import sys
HEAD = """
<html>
<head>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
box-sizing: border-box;
min-width: 200px;
max-width: 980px;
margin: 0 auto;
padding: 45px;
}
</style>
<body>
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
"""
TAIL = """
</div>
<!-- You can change the lines below now. -->
<script type="text/javascript">
marked.setOptions({
renderer: new marked.Renderer(),
gfm: true,
breaks: false,
smartypants: true,
highlight: function(code, lang) {
code = code.replace(/&amp;/g, "&")
code = code.replace(/&gt;/g, ">")
code = code.replace(/&lt;/g, "<")
code = code.replace(/&nbsp;/g, " ")
return hljs.highlightAuto(code, [lang]).value;
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
</script>
</body>
"""
def convert_markdown_into_html(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument('filenames', nargs='*', help='Filenames to fix')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
with open(
re.sub(r"README", "index", re.sub(r"\.md$", ".html", filename)),
"w") as output:
output.write(HEAD)
with open(filename) as input:
for line in input:
output.write(line)
output.write(TAIL)
return retv
if __name__ == '__main__':
sys.exit(convert_markdown_into_html())
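Run by pre-commit on a changed README, the script above writes an `index.html` next to the source file. A minimal manual invocation (the `ctr/` path is only an illustrative example) would be:
```
python .pre-commit-hooks/convert_markdown_into_html.py ctr/README.md
# writes ctr/index.html, wrapping the markdown in the HEAD/TAIL template above
```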
group: deprecated-2017Q2
language: cpp
cache: ccache
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
@@ -16,8 +19,12 @@ addons:
- python2.7-dev
before_install:
- pip install -U virtualenv pre-commit pip
- docker pull paddlepaddle/paddle:latest
script:
- .travis/precommit.sh
- docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c
  'cd /py_unittest; sh .travis/unittest.sh'
notifications:
email:
on_success: change
......
#!/bin/bash
abort(){
echo "Run unittest failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
unittest(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
sh setup.sh
fi
if [ $? != 0 ]; then
exit 1
fi
find . -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python -m unittest discover -v -s {}'
cd - > /dev/null
}
trap 'abort' 0
set -e
for proj in */ ; do
if [ -d $proj ]; then
unittest $proj
if [ $? != 0 ]; then
exit 1
fi
fi
done
trap : 0
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# Introduction to models
Model configurations
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://github.com/PaddlePaddle/models)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/models)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
PaddlePaddle provides a rich collection of operators that let you build a great variety of deep learning models in a modular way to solve different application problems. Here, we provide neural network models for common machine learning tasks for you to study and use.
## 1. Word Embedding
A word embedding represents a word with a real-valued vector, each dimension of which captures some latent syntactic or semantic feature of the text; it is one of the most successful ideas from applying deep learning to natural language processing. More broadly, word embeddings can also be applied to ordinary discrete features. Learning word embeddings is usually an unsupervised process, so it can exploit massive amounts of unlabeled data to capture relationships between features and effectively mitigate feature sparsity, missing labels, and noisy data. In common word-embedding methods, however, the last layer of the model often faces an extremely large classification problem, which becomes the performance bottleneck.
In the word embedding examples, we show how to use Hierarchical Sigmoid and Noise Contrastive Estimation (NCE) to speed up word-embedding training.
- 1.1 [Accelerating word-embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
- 1.2 [Accelerating word-embedding training with Noise Contrastive Estimation](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)
## 2. Generating Text with an RNN Language Model
The language model is a fundamental model in natural language processing. Besides yielding word embeddings (a by-product of training), it can also help us generate text: given several words, a language model predicts the most likely next word. In the text generation example, we focus on recurrent neural network language models; following the documentation, you can quickly adapt them to your own corpus and build fun applications such as automatic poetry or prose writing.
- 2.1 [Generating text with an RNN language model](https://github.com/PaddlePaddle/models/tree/develop/generate_sequence_by_rnn_lm)
## 3. Click-Through Rate Prediction
A click-through rate (CTR) model estimates the probability that a user clicks on an ad and makes a prediction for every ad impression; it is one of the core algorithms of advertising technology. Logistic regression learns well from large-scale sparse features and dominated the early stage of CTR prediction. In recent years, DNN models have gradually taken over thanks to their strong learning capacity.
In the CTR example, we present the Wide & Deep model proposed by Google. It combines the strengths of a DNN, which is good at learning abstract features, and of logistic regression, which handles large-scale sparse features well; it can be used as a relatively mature model framework and has some industrial adoption.
- 3.1 [Wide & Deep CTR prediction model](https://github.com/PaddlePaddle/models/tree/develop/ctr)
## 4. Text Classification
Text classification is one of the most fundamental tasks in natural language processing. Deep learning methods remove the need for complex feature engineering: they take raw text as input and optimize classification accuracy in a data-driven way.
In the text classification example, we take sentiment classification as the task and provide a non-sequential DNN-based text classification model and a sequential CNN-based model (an LSTM-based model is covered in the [Sentiment Analysis](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md) chapter of PaddleBook).
- 4.1 [Sentiment classification with DNN / CNN](https://github.com/PaddlePaddle/models/tree/develop/text_classification)
## 5. Learning to Rank
Learning to Rank (LTR) is one of the core problems in information retrieval and search-engine research. Machine learning is used to learn a scoring function that scores the candidates to be ranked, and the ranking is then determined by those scores. Deep neural networks can model the scoring function, giving rise to various deep-learning-based LTR models.
In the LTR example, we introduce a pairwise ranking model based on the RankLoss cost and a listwise ranking model based on the LambdaRank cost (the pointwise approach is covered in the [Recommender System](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md) chapter of PaddleBook).
- 5.1 [Pairwise and listwise learning to rank](https://github.com/PaddlePaddle/models/tree/develop/ltr)
## 6. Sequence Tagging
Given an input sequence, a sequence tagging model assigns a class label to every element of the sequence; it is one of the most fundamental tasks in natural language processing. With the development of deep learning, using a recurrent neural network to learn feature representations of the input sequence and a Conditional Random Field (CRF) to perform the tagging on top of those features has gradually become the standard solution to sequence tagging problems.
In the sequence tagging example, we take Named Entity Recognition (NER) as the task and show how to train an end-to-end sequence tagging model.
- 6.1 [Named entity recognition](https://github.com/PaddlePaddle/models/tree/develop/sequence_tagging_for_ner)
## 7. Sequence-to-Sequence Learning
Sequence-to-sequence learning maps between two or more variable-length sequences. It has a wide range of applications, including machine translation, dialogue and question answering, ad-copy generation, auto-encoding (e.g., encoding financial profiles), and judging the semantic relatedness of multiple text strings.
In the sequence-to-sequence example, we take machine translation as the task and provide several improved models: a sequence-to-sequence model without attention, which is the basis of all sequence-to-sequence models; scheduled sampling, which alleviates error accumulation of RNN models during generation; and neural machine translation with external memory, which strengthens the network's memory capacity to handle complex sequence-to-sequence tasks.
- 7.1 [Encoder-decoder model without attention](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention)
## 8. Image Classification
Compared with text, images convey information that is more vivid, easier to understand, and more artistic, and they are an important medium through which people exchange information. In the image classification example, we show how to train AlexNet, VGG, GoogLeNet, and ResNet models with PaddlePaddle. We also provide a conversion tool that turns model files trained with Caffe into PaddlePaddle model files.
- 8.1 [Converting Caffe model files to PaddlePaddle model files](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle)
- 8.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
# Click-Through Rate Prediction
## Background
Click-Through Rate (CTR) \[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] is the probability that a user clicks on a particular link and is commonly used to measure the effectiveness of an online advertising system.
When there are multiple ad slots, the predicted CTR usually serves as the basis for ranking.
For example, in a search engine's advertising system, when a user enters a commercially valuable query, the system roughly performs the following steps to display ads:
1. Retrieve the set of ads matching the query
2. Filter by business rules and relevance
3. Rank by the auction mechanism and CTR
4. Display the ads
As we can see, CTR plays an important role in the final ranking.
### Stages of Development
In industry, CTR models have gone through the following stages:
- Logistic Regression (LR) / GBDT + feature engineering
- LR + DNN features
- DNN + feature engineering
In the early days LR dominated, but recently DNN models, with their strong learning capacity and increasingly mature performance optimizations,
have gradually taken over CTR prediction.
### LR vs DNN
The figure below compares the structure of LR with a \(3x2\) DNN model:
<p align="center">
<img src="images/lr_vs_dnn.jpg" width="620" hspace='10'/> <br/>
Figure 1. Comparison of LR and DNN model structures
</p>
The blue arrows in the LR part map directly onto corresponding structures in the DNN, so LR and a DNN share some common ground (such as summing weighted inputs),
but for the same input dimensionality the former's model complexity can be much lower than the latter's (roughly speaking, the more complex the model, the more potential it has to learn complex information).
For LR to match the learning capacity of a DNN, it must increase the input dimensionality, that is, the number of features,
which is why LR is always tied to large-scale feature engineering.
The advantage of LR over DNN models is its capacity for large-scale sparse features, in terms of both memory and computation, for which industry has very mature optimizations.
A DNN, on the other hand, can learn new features by itself, which to some extent improves feature efficiency
and makes it more likely to learn better from the same set of features.
The following sections demonstrate how to write a model in PaddlePaddle that combines the advantages of both.
## Data and Task Abstraction
Taking `click` as the learning target, the task can be formulated in several ways:
1. Learn click directly as binary (0/1) classification
2. Learning to rank, concretely with pairwise ranking (label 1 > 0) or listwise ranking
3. Estimate each ad's CTR, pair up ads under the same query so that higher CTR > lower CTR, and perform ranking or classification
We use the first formulation and treat it as a classification task.
We demonstrate the model on the dataset of the Kaggle `Click-through rate prediction` competition \[[2](https://www.kaggle.com/c/avazu-ctr-prediction/data)\].
See [data process](./dataset.md) for the details of feature processing.
## Wide & Deep Learning Model
In 2016 Google proposed the Wide & Deep Learning framework to combine the strengths of a DNN, which is good at learning abstract features, and of LR, which handles large-scale sparse features well.
### Model Overview
The Wide & Deep Learning Model \[[3](#references)\] can be used as a relatively mature model framework;
it has some industrial adoption for CTR prediction, so this article uses it to complete the CTR prediction task.
The model structure is as follows:
<p align="center">
<img src="images/wide_deep.png" width="820" hspace='10'/> <br/>
Figure 2. Wide & Deep Model
</p>
The Wide part on the left can accommodate large-scale sparse features and has a certain memorization ability for specific information (such as IDs);
the Deep part on the right can learn implicit interactions between features and has better learning and generalization ability given the same number of features.
### Model Inputs
The model takes only three inputs:
- `dnn_input`, the input of the Deep part
- `lr_input`, the input of the Wide part
- `click`, whether the ad was clicked, used as the label for binary classification
```python
dnn_merged_input = layer.data(
name='dnn_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input']))
lr_merged_input = layer.data(
name='lr_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input']))
click = paddle.layer.data(name='click', type=dtype.dense_vector(1))
```
### The Wide Part
The Wide part uses the LR model directly, but the activation function is changed to `RELU` to speed things up.
```python
def build_lr_submodel():
fc = layer.fc(
input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu())
return fc
```
### The Deep Part
The Deep part is a standard multi-layer feed-forward DNN.
```python
def build_dnn_submodel(dnn_layer_dims):
dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0])
_input_layer = dnn_embedding
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = layer.fc(
input=_input_layer,
size=dim,
act=paddle.activation.Relu(),
name='dnn-fc-%d' % i)
_input_layer = fc
return _input_layer
```
### Combining the Two
The top-level outputs of the two submodels are combined by a weighted sum to form the model output. A `sigmoid` activation maps the output into the interval (0, 1),
so that it approximates the distribution of the binary labels in the training data and is used as the predicted CTR.
```python
# combine DNN and LR submodels
def combine_submodels(dnn, lr):
merge_layer = layer.concat(input=[dnn, lr])
fc = layer.fc(
input=merge_layer,
size=1,
name='output',
# use the sigmoid function to approximate CTR, which is a float value between 0 and 1.
act=paddle.activation.Sigmoid())
return fc
```
### Defining the Training Task
```python
dnn = build_dnn_submodel(dnn_layer_dims)
lr = build_lr_submodel()
output = combine_submodels(dnn, lr)
# ==============================================================================
# cost and train period
# ==============================================================================
classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost(
input=output, label=click)
paddle.init(use_gpu=False, trainer_count=11)
params = paddle.parameters.create(classification_cost)
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(
cost=classification_cost, parameters=params, update_equation=optimizer)
dataset = AvazuDataset(train_data_path, n_records_as_test=test_set_size)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
logging.warning("Pass %d, Samples %d, Cost %f" % (
event.pass_id, event.batch_id * batch_size, event.cost))
if event.batch_id % 1000 == 0:
result = trainer.test(
reader=paddle.batch(dataset.test, batch_size=1000),
feeding=field_index)
logging.warning("Test %d-%d, Cost %f" % (event.pass_id, event.batch_id,
result.cost))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(dataset.train, buf_size=500),
batch_size=batch_size),
feeding=field_index,
event_handler=event_handler,
num_passes=100)
```
## Running Training and Testing
Training the model involves the following steps:
1. Download the training data; you can use the data of the Kaggle CTR competition \[[2](#references)\]
   1. Download train.gz from [Kaggle CTR](https://www.kaggle.com/c/avazu-ctr-prediction/data)
   2. Unpack train.gz to get train.txt
2. Run `python train.py --train_data_path train.txt` to start training
In step 2 you can pass command-line arguments to `train.py` to customize training; the available arguments and their usage are listed below.
```
usage: train.py [-h] --train_data_path TRAIN_DATA_PATH
[--batch_size BATCH_SIZE] [--test_set_size TEST_SET_SIZE]
[--num_passes NUM_PASSES]
[--num_lines_to_detact NUM_LINES_TO_DETACT]
PaddlePaddle CTR example
optional arguments:
-h, --help show this help message and exit
--train_data_path TRAIN_DATA_PATH
path of training dataset
--batch_size BATCH_SIZE
size of mini-batch (default:10000)
--test_set_size TEST_SET_SIZE
size of the validation dataset(default: 10000)
--num_passes NUM_PASSES
number of passes to train
--num_lines_to_detact NUM_LINES_TO_DETACT
number of records to detect dataset's meta info
```
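For example, a typical invocation on the unpacked Kaggle data (the argument values below are only illustrative and match the defaults in the help text) would be:
```
python train.py --train_data_path train.txt --batch_size 10000 --num_passes 10
```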
## References
1. <https://en.wikipedia.org/wiki/Click-through_rate>
2. <https://www.kaggle.com/c/avazu-ctr-prediction/data>
3. Cheng H T, Koc L, Harmsen J, et al. [Wide & deep learning for recommender systems](https://arxiv.org/pdf/1606.07792.pdf)[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10.
import sys
import csv
import numpy as np
'''
The fields of the dataset are:
0. id: ad identifier
1. click: 0/1 for non-click/click
2. hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
3. C1 -- anonymized categorical variable
4. banner_pos
5. site_id
6. site_domain
7. site_category
8. app_id
9. app_domain
10. app_category
11. device_id
12. device_ip
13. device_model
14. device_type
15. device_conn_type
16. C14-C21 -- anonymized categorical variables
We will treat following fields as categorical features:
- C1
- banner_pos
- site_category
- app_category
- device_type
- device_conn_type
and some other features as id features:
- id
- site_id
- app_id
- device_id
The `hour` field will be treated as a continuous feature and will be transformed
to one-hot representation which has 24 bits.
'''
feature_dims = {}
categorial_features = ('C1 banner_pos site_category app_category ' +
'device_type device_conn_type').split()
id_features = 'id site_id app_id device_id _device_id_cross_site_id'.split()
def get_all_field_names(mode=0):
'''
@mode: int
0 for train, 1 for test
@return: list of str
'''
return categorial_features + ['hour'] + id_features + ['click'] \
if mode == 0 else []
class CategoryFeatureGenerator(object):
'''
Generate category features.
Register all records by calling `register` first, then call `gen` to generate
one-hot representation for a record.
'''
def __init__(self):
self.dic = {'unk': 0}
self.counter = 1
def register(self, key):
'''
Register record.
'''
if key not in self.dic:
self.dic[key] = self.counter
self.counter += 1
def size(self):
return len(self.dic)
def gen(self, key):
'''
Generate one-hot representation for a record.
'''
if key not in self.dic:
res = self.dic['unk']
else:
res = self.dic[key]
return [res]
def __repr__(self):
return '<CategoryFeatureGenerator %d>' % len(self.dic)
class IDfeatureGenerator(object):
def __init__(self, max_dim, cross_fea0=None, cross_fea1=None):
'''
@max_dim: int
Size of the id elements' space
'''
self.max_dim = max_dim
self.cross_fea0 = cross_fea0
self.cross_fea1 = cross_fea1
def gen(self, key):
'''
Generate one-hot representation for records
'''
return [hash(key) % self.max_dim]
def gen_cross_fea(self, fea1, fea2):
key = str(fea1) + str(fea2)
return self.gen(key)
def size(self):
return self.max_dim
class ContinuousFeatureGenerator(object):
def __init__(self, n_intervals):
self.min = sys.maxint
self.max = -sys.maxint - 1
self.n_intervals = n_intervals
def register(self, val):
self.min = min(self.min, val)
self.max = max(self.max, val)
def gen(self, val):
self.len_part = (self.max - self.min) / self.n_intervals
return (val - self.min) / self.len_part
# init all feature generators
fields = {}
for key in categorial_features:
fields[key] = CategoryFeatureGenerator()
for key in id_features:
# for cross features
if 'cross' in key:
feas = key[1:].split('_cross_')
fields[key] = IDfeatureGenerator(10000000, *feas)
# for normal ID features
else:
fields[key] = IDfeatureGenerator(10000)
# used as feed_dict in PaddlePaddle
field_index = dict((key, id)
for id, key in enumerate(['dnn_input', 'lr_input', 'click']))
def detect_dataset(path, topn, id_fea_space=10000):
'''
Parse the first `topn` records to collect meta information of this dataset.
NOTE the records should be randomly shuffled first.
'''
# create categorical statistics objects.
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
for row_id, row in enumerate(reader):
if row_id > topn:
break
for key in categorial_features:
fields[key].register(row[key])
for key, item in fields.items():
feature_dims[key] = item.size()
#for key in id_features:
#feature_dims[key] = id_fea_space
feature_dims['hour'] = 24
feature_dims['click'] = 1
feature_dims['dnn_input'] = np.sum(
feature_dims[key] for key in categorial_features + ['hour']) + 1
feature_dims['lr_input'] = np.sum(feature_dims[key]
for key in id_features) + 1
return feature_dims
def concat_sparse_vectors(inputs, dims):
'''
Concatenate more than one sparse vector into one.
@inputs: list
list of sparse vector
@dims: list of int
dimension of each sparse vector
'''
res = []
assert len(inputs) == len(dims)
start = 0
for no, vec in enumerate(inputs):
for v in vec:
res.append(v + start)
start += dims[no]
return res
class AvazuDataset(object):
'''
Load AVAZU dataset as train set.
'''
TRAIN_MODE = 0
TEST_MODE = 1
def __init__(self, train_path, n_records_as_test=-1):
self.train_path = train_path
self.n_records_as_test = n_records_as_test
# task mode: 0 train, 1 test
self.mode = 0
def train(self):
self.mode = self.TRAIN_MODE
return self._parse(self.train_path, skip_n_lines=self.n_records_as_test)
def test(self):
self.mode = self.TEST_MODE
return self._parse(self.train_path, top_n_lines=self.n_records_as_test)
def _parse(self, path, skip_n_lines=-1, top_n_lines=-1):
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
categorial_dims = [
feature_dims[key] for key in categorial_features + ['hour']
]
id_dims = [feature_dims[key] for key in id_features]
for row_id, row in enumerate(reader):
if skip_n_lines > 0 and row_id < skip_n_lines:
continue
if top_n_lines > 0 and row_id > top_n_lines:
break
record = []
for key in categorial_features:
record.append(fields[key].gen(row[key]))
record.append([int(row['hour'][-2:])])
dense_input = concat_sparse_vectors(record, categorial_dims)
record = []
for key in id_features:
if 'cross' not in key:
record.append(fields[key].gen(row[key]))
else:
fea0 = fields[key].cross_fea0
fea1 = fields[key].cross_fea1
record.append(
fields[key].gen_cross_fea(row[fea0], row[fea1]))
sparse_input = concat_sparse_vectors(record, id_dims)
record = [dense_input, sparse_input]
record.append(list((int(row['click']), )))
yield record
if __name__ == '__main__':
path = 'train.txt'
print detect_dataset(path, 400000)
filereader = AvazuDataset(path)
for no, rcd in enumerate(filereader.train()):
print no, rcd
if no > 1000: break
# Data and Processing
## Dataset
The dataset is stored in `csv` format; its fields are as follows:
- `id` : ad identifier
- `click` : 0/1 for non-click/click
- `hour` : format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
- `C1` : anonymized categorical variable
- `banner_pos`
- `site_id`
- `site_domain`
- `site_category`
- `app_id`
- `app_domain`
- `app_category`
- `device_id`
- `device_ip`
- `device_model`
- `device_type`
- `device_conn_type`
- `C14-C21` : anonymized categorical variables
## Feature Extraction
Below we briefly demonstrate several ways of extracting features.
The features in the raw data fall into the following categories:
1. ID features (sparse, with very many values)
- `id`
- `site_id`
- `app_id`
- `device_id`
2. Categorical features (sparse, but with a limited number of values)
- `C1`
- `site_category`
- `device_type`
- `C14-C21`
3. Numerical features converted to categorical features
- hour (can be used directly as a numerical value, or converted to one category per hour)
### Categorical Features
Categorical features can be extracted in two ways:
1. Use a one-hot representation as the feature
2. As with word embeddings, use an embedding to map each category to a vector
### ID Features
ID features are sparse but very numerous, so representing them directly as one-hot vectors would produce far too many dimensions.
They are usually processed as follows:
1. Choose a maximum dimensionality N for the representation
2. newid = id % N
3. Use newid as a categorical feature
Although this method has a certain probability of collisions, it can handle an arbitrary number of ID features while preserving much of their effect \[[2](#references)\].
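As a minimal sketch of the hashing trick described above (illustrative only; the code actually used in this example is the `IDfeatureGenerator` class shown later):
```python
# Hashing trick: map an unbounded id space onto N buckets.
N = 10000  # chosen maximum dimensionality of the representation


def id_to_bucket(raw_id):
    # Collisions are possible, but the feature space stays bounded.
    return hash(raw_id) % N


print(id_to_bucket('some_device_id'))  # a bucket index in [0, N)
```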
### Numerical Features
They are usually handled in one of two ways (a small sketch follows below):
- Normalize them and feed them into the model directly as features
- Split the value range into intervals and treat them as categorical features, giving a sparse representation that blurs small differences
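In this dataset the only numerical field is `hour` (format YYMMDDHH). A minimal sketch of the second option, turning it into one of 24 hour-of-day categories as the data provider does with `int(row['hour'][-2:])`, looks like this:
```python
# '14091123' means 23:00 on Sept. 11, 2014, so the last two digits
# give the hour-of-day category (one of 24 values).
def hour_to_category(hour_field):
    return int(hour_field[-2:])  # 0..23


print(hour_to_category('14091123'))  # 23
```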
## Feature Processing
### Categorical Features
A categorical feature takes a limited number of values; in the model we usually use an embedding to map each value to a vector of continuous values.
When fed into the model, such a feature usually uses a one-hot representation; the related processing is as follows:
```python
class CategoryFeatureGenerator(object):
'''
Generate category features.
Register all records by calling `register` first, then call `gen` to generate
one-hot representation for a record.
'''
def __init__(self):
self.dic = {'unk': 0}
self.counter = 1
def register(self, key):
'''
Register record.
'''
if key not in self.dic:
self.dic[key] = self.counter
self.counter += 1
def size(self):
return len(self.dic)
def gen(self, key):
'''
Generate one-hot representation for a record.
'''
if key not in self.dic:
res = self.dic['unk']
else:
res = self.dic[key]
return [res]
def __repr__(self):
return '<CategoryFeatureGenerator %d>' % len(self.dic)
```
`CategoryFeatureGenerator` must first scan the dataset to collect the set of values of each category before it can start generating features.
Our experimental dataset \[[3](https://www.kaggle.com/c/avazu-ctr-prediction/data)\] has already been shuffled, so scanning a certain number of leading records approximates the full set of category values (equivalent to random sampling);
low-frequency values that are not sampled can be represented by a special UNK value.
```python
fields = {}
for key in categorial_features:
fields[key] = CategoryFeatureGenerator()
def detect_dataset(path, topn, id_fea_space=10000):
'''
Parse the first `topn` records to collect meta information of this dataset.
NOTE the records should be randomly shuffled first.
'''
# create categorical statistics objects.
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
for row_id, row in enumerate(reader):
if row_id > topn:
break
for key in categorial_features:
fields[key].register(row[key])
```
After registering the category information found in the dataset, `CategoryFeatureGenerator` can generate the corresponding feature representation for a record:
```python
record = []
for key in categorial_features:
record.append(fields[key].gen(row[key]))
```
In this task, categorical features are fed into the DNN.
### ID Features
ID features are sparse and their value space is very large, so a modulo operation is usually used to reduce them to a bounded space,
after which they can be used like categorical features. Here, ID features are fed into the LR model.
```python
class IDfeatureGenerator(object):
def __init__(self, max_dim):
'''
@max_dim: int
Size of the id elements' space
'''
self.max_dim = max_dim
def gen(self, key):
'''
Generate one-hot representation for records
'''
return [hash(key) % self.max_dim]
def size(self):
return self.max_dim
```
`IDfeatureGenerator` needs no prior initialization and can generate features directly, for example:
```python
record = []
for key in id_features:
if 'cross' not in key:
record.append(fields[key].gen(row[key]))
```
### Cross Features
As the `wide` part of the Wide & Deep model, the LR model can take very wide input (a feature space of very high dimensionality).
To make full use of this advantage, we demonstrate crossing features to build higher-dimensional combined features, which are then fed into the model for training.
Here we again use a modulo operation to bound the size of the resulting feature space; the implementation simply adds a `gen_cross_fea` method to `IDfeatureGenerator`:
```python
def gen_cross_fea(self, fea1, fea2):
key = str(fea1) + str(fea2)
return self.gen(key)
```
For example, if we believe that `device_id` and `site_id` in the raw data are correlated (say, a given device tends to visit particular sites),
we can cross the two to capture this kind of information.
```python
fea0 = fields[key].cross_fea0
fea1 = fields[key].cross_fea1
record.append(
fields[key].gen_cross_fea(row[fea0], row[fea1]))
```
### Feature Dimensions
#### Deep submodel (DNN) features
| Feature | Dimension |
|------------------|-----------|
| app_category | 21 |
| site_category | 22 |
| device_conn_type | 5 |
| hour | 24 |
| banner_pos | 7 |
| **Total** | 79 |
#### Wide submodel (LR) features
| Feature | Dimension |
|---------------------|-----------|
| id | 10000 |
| site_id | 10000 |
| app_id | 10000 |
| device_id | 10000 |
| device_id X site_id | 1000000 |
| **Total** | 1,040,000 |
## Feeding the Data into PaddlePaddle
Both the Deep and the Wide part take input in the `sparse_binary_vector` format \[[1](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst)\]; the related features must be concatenated before being fed in. The model ultimately takes only three inputs,
namely
1. `dnn input`, the input of the DNN part
2. `lr input`, the input of the LR part
3. `click`, the label
The features are concatenated as follows:
```python
def concat_sparse_vectors(inputs, dims):
'''
concatenate sparse vectors into one
@inputs: list
list of sparse vector
@dims: list of int
dimension of each sparse vector
'''
res = []
assert len(inputs) == len(dims)
start = 0
for no, vec in enumerate(inputs):
for v in vec:
res.append(v + start)
start += dims[no]
return res
```
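A small worked example of what this does (the numbers are made up): with `dims = [3, 4]`, the sparse index lists `[1]` and `[0, 2]` are merged into one 7-dimensional sparse vector, with the second list's indices shifted by the first vector's dimension.
```python
# [1] lives in a 3-dim space, [0, 2] in a 4-dim space;
# after concatenation the indices become [1, 3, 5] in a 7-dim space.
print(concat_sparse_vectors([[1], [0, 2]], [3, 4]))  # [1, 3, 5]
```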
The code that generates the final features is as follows:
```python
# dimensions of the features
categorial_dims = [
feature_dims[key] for key in categorial_features + ['hour']
]
id_dims = [feature_dims[key] for key in id_features]
dense_input = concat_sparse_vectors(record, categorial_dims)
sparse_input = concat_sparse_vectors(record, id_dims)
record = [dense_input, sparse_input]
record.append(list((int(row['click']), )))
yield record
```
## References
1. <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst>
2. Mikolov T, Deoras A, Povey D, et al. [Strategies for training large scale neural network language models](https://www.researchgate.net/profile/Lukas_Burget/publication/241637478_Strategies_for_training_large_scale_neural_network_language_models/links/542c14960cf27e39fa922ed3.pdf)[C]//Automatic Speech Recognition and Understanding (ASRU), 2011 IEEE Workshop on. IEEE, 2011: 196-201.
3. <https://www.kaggle.com/c/avazu-ctr-prediction/data>
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import logging
import paddle.v2 as paddle
from paddle.v2 import layer
from paddle.v2 import data_type as dtype
from data_provider import field_index, detect_dataset, AvazuDataset
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
required=True,
help="path of training dataset")
parser.add_argument(
'--batch_size',
type=int,
default=10000,
help="size of mini-batch (default:10000)")
parser.add_argument(
'--test_set_size',
type=int,
default=10000,
help="size of the validation dataset(default: 10000)")
parser.add_argument(
'--num_passes', type=int, default=10, help="number of passes to train")
parser.add_argument(
'--num_lines_to_detact',
type=int,
default=500000,
help="number of records to detect dataset's meta info")
args = parser.parse_args()
dnn_layer_dims = [128, 64, 32, 1]
data_meta_info = detect_dataset(args.train_data_path, args.num_lines_to_detact)
logging.warning('detect categorical fields in dataset %s' %
args.train_data_path)
for key, item in data_meta_info.items():
logging.warning(' - {}\t{}'.format(key, item))
paddle.init(use_gpu=False, trainer_count=1)
# ==============================================================================
# input layers
# ==============================================================================
dnn_merged_input = layer.data(
name='dnn_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input']))
lr_merged_input = layer.data(
name='lr_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input']))
click = paddle.layer.data(name='click', type=dtype.dense_vector(1))
# ==============================================================================
# network structure
# ==============================================================================
def build_dnn_submodel(dnn_layer_dims):
dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0])
_input_layer = dnn_embedding
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = layer.fc(
input=_input_layer,
size=dim,
act=paddle.activation.Relu(),
name='dnn-fc-%d' % i)
_input_layer = fc
return _input_layer
# config LR submodel
def build_lr_submodel():
fc = layer.fc(
input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu())
return fc
# combine DNN and LR submodels
def combine_submodels(dnn, lr):
merge_layer = layer.concat(input=[dnn, lr])
fc = layer.fc(
input=merge_layer,
size=1,
name='output',
# use the sigmoid function to approximate CTR, a float value between 0 and 1.
act=paddle.activation.Sigmoid())
return fc
dnn = build_dnn_submodel(dnn_layer_dims)
lr = build_lr_submodel()
output = combine_submodels(dnn, lr)
# ==============================================================================
# cost and train period
# ==============================================================================
classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost(
input=output, label=click)
params = paddle.parameters.create(classification_cost)
optimizer = paddle.optimizer.Momentum(momentum=0.01)
trainer = paddle.trainer.SGD(
cost=classification_cost, parameters=params, update_equation=optimizer)
dataset = AvazuDataset(
args.train_data_path, n_records_as_test=args.test_set_size)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
num_samples = event.batch_id * args.batch_size
if event.batch_id % 100 == 0:
logging.warning("Pass %d, Samples %d, Cost %f" %
(event.pass_id, num_samples, event.cost))
if event.batch_id % 1000 == 0:
result = trainer.test(
reader=paddle.batch(dataset.test, batch_size=args.batch_size),
feeding=field_index)
logging.warning("Test %d-%d, Cost %f" %
(event.pass_id, event.batch_id, result.cost))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(dataset.train, buf_size=500),
batch_size=args.batch_size),
feeding=field_index,
event_handler=event_handler,
num_passes=args.num_passes)
# Deep Speech 2 on PaddlePaddle
## Installation
Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.
```
sh setup.sh
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
```
For some machines, we also need to install libsndfile1. Details to be added.
## Usage
### Preparing Data
```
cd datasets
sh run_all.sh
cd ..
```
`sh run_all.sh` prepares all ASR datasets (currently, only LibriSpeech is available). After it finishes, several summary manifest files in JSON format are produced.
A manifest file summarizes a speech dataset: each line contains, in JSON format, the metadata (i.e. audio filepath, transcript text, audio duration) of one audio file in the dataset. The manifest file serves as an interface that tells the system where the speech samples are and what to read.
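For illustration only (the exact key names here are an assumption, not taken from this document), one line of such a manifest might look like:
```
{"audio_filepath": "/data/LibriSpeech/dev-clean/sample-0001.flac", "duration": 3.52, "text": "some transcript text"}
```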
More help for arguments:
```
python datasets/librispeech/librispeech.py --help
```
### Preparing for Training
```
python compute_mean_std.py
```
`python compute_mean_std.py` computes the mean and standard deviation of the audio features and saves them to a file named `./mean_std.npz` by default. This file is used in both training and inference.
More help for arguments:
```
python compute_mean_std.py --help
```
### Training
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py
```
For CPU Training:
```
python train.py --use_gpu False
```
More help for arguments:
```
python train.py --help
```
### Inferencing
```
CUDA_VISIBLE_DEVICES=0 python infer.py
```
More help for arguments:
```
python infer.py --help
```
"""Compute mean and std for feature normalizer, and save to file."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
parser = argparse.ArgumentParser(
description='Computing mean and stddev for feature normalizer.')
parser.add_argument(
"--manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for computing normalizer's mean and stddev."
"(default: %(default)s)")
parser.add_argument(
"--num_samples",
default=2000,
type=int,
help="Number of samples for computing mean and stddev. "
"(default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='{}',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--output_file",
default='mean_std.npz',
type=str,
help="Filepath to write mean and std to (.npz)."
"(default: %(default)s)")
args = parser.parse_args()
def main():
augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
audio_featurizer = AudioFeaturizer()
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=args.manifest_path,
featurize_func=augment_and_featurize,
num_samples=args.num_samples)
normalizer.write_to_file(args.output_file)
if __name__ == '__main__':
main()
"""Contains the audio segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import io
import soundfile
import resampy
from scipy import signal
import random
import copy
class AudioSegment(object):
"""Monaural audio segment abstraction.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:raises TypeError: If the sample data type is not float or int.
"""
def __init__(self, samples, sample_rate):
"""Create audio segment from samples.
Samples are converted to float32 internally, and integer types are scaled to [-1, 1].
"""
self._samples = self._convert_samples_to_float32(samples)
self._sample_rate = sample_rate
if self._samples.ndim >= 2:
self._samples = np.mean(self._samples, 1)
def __eq__(self, other):
"""Return whether two objects are equal."""
if type(other) is not type(self):
return False
if self._sample_rate != other._sample_rate:
return False
if self._samples.shape != other._samples.shape:
return False
if np.any(self.samples != other._samples):
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
def __str__(self):
"""Return human-readable representation of segment."""
return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
"rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
self.duration, self.rms_db))
@classmethod
def from_file(cls, file):
"""Create audio segment from audio file.
:param filepath: Filepath or file object to audio file.
:type filepath: basestring|file
:return: Audio segment instance.
:rtype: AudioSegment
"""
samples, sample_rate = soundfile.read(file, dtype='float32')
return cls(samples, sample_rate)
@classmethod
def slice_from_file(cls, file, start=None, end=None):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.
:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = 0. if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)
@classmethod
def from_bytes(cls, bytes):
"""Create audio segment from a byte string containing audio samples.
:param bytes: Byte string containing audio samples.
:type bytes: str
:return: Audio segment instance.
:rtype: AudioSegment
"""
samples, sample_rate = soundfile.read(
io.BytesIO(bytes), dtype='float32')
return cls(samples, sample_rate)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of audio segments together.
:param *segments: Input audio segments to be concatenated.
:type *segments: tuple of AudioSegment
:return: Audio segment instance as concatenating results.
:rtype: AudioSegment
:raises ValueError: If the number of segments is zero, or if the
sample_rate of any segments does not match.
:raises TypeError: If any segment is not AudioSegment instance.
"""
# Perform basic sanity-checks.
if len(segments) == 0:
raise ValueError("No audio segments are given to concatenate.")
sample_rate = segments[0]._sample_rate
for seg in segments:
if sample_rate != seg._sample_rate:
raise ValueError("Can't concatenate segments with "
"different sample rates")
if type(seg) is not cls:
raise TypeError("Only audio segments of the same type "
"can be concatenated.")
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)
def to_wav_file(self, filepath, dtype='float32'):
"""Save audio segment to disk as wav file.
:param filepath: WAV filepath or file object to save the
audio segment.
:type filepath: basestring|file
:param dtype: Subtype for audio file. Options: 'int16', 'int32',
'float32', 'float64'. Default is 'float32'.
:type dtype: str
:raises TypeError: If dtype is not supported.
"""
samples = self._convert_samples_from_float32(self._samples, dtype)
subtype_map = {
'int16': 'PCM_16',
'int32': 'PCM_32',
'float32': 'FLOAT',
'float64': 'DOUBLE'
}
soundfile.write(
filepath,
samples,
self._sample_rate,
format='WAV',
subtype=subtype_map[dtype])
def superimpose(self, other):
"""Add samples from another segment to those of this segment
(sample-wise addition, not segment concatenation).
Note that this is an in-place transformation.
:param other: Segment containing samples to be added in.
:type other: AudioSegments
:raise TypeError: If type of two segments don't match.
:raise ValueError: If the sample rates of the two segments are not
equal, or if the lengths of segments don't match.
"""
if type(self) != type(other):
raise TypeError("Cannot add segments of different types: %s "
"and %s." % (type(self), type(other)))
if self._sample_rate != other._sample_rate:
raise ValueError("Sample rates must match to add segments.")
if len(self._samples) != len(other._samples):
raise ValueError("Segment lengths must match to add segments.")
self._samples += other._samples
def to_bytes(self, dtype='float32'):
"""Create a byte string containing the audio content.
:param dtype: Data type for export samples. Options: 'int16', 'int32',
'float32', 'float64'. Default is 'float32'.
:type dtype: str
:return: Byte string containing audio content.
:rtype: str
"""
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples.tostring()
def gain_db(self, gain):
"""Apply gain in decibels to samples.
Note that this is an in-place transformation.
:param gain: Gain in decibels to apply to samples.
:type gain: float
"""
self._samples *= 10.**(gain / 20.)
def change_speed(self, speed_rate):
"""Change the audio speed by linear interpolation.
Note that this is an in-place transformation.
:param speed_rate: Rate of speed change:
speed_rate > 1.0, speed up the audio;
speed_rate = 1.0, unchanged;
speed_rate < 1.0, slow down the audio;
speed_rate <= 0.0, not allowed, raise ValueError.
:type speed_rate: float
:raises ValueError: If speed_rate <= 0.0.
"""
if speed_rate <= 0:
raise ValueError("speed_rate should be greater than zero.")
old_length = self._samples.shape[0]
new_length = int(old_length / speed_rate)
old_indices = np.arange(old_length)
new_indices = np.linspace(start=0, stop=old_length, num=new_length)
self._samples = np.interp(new_indices, old_indices, self._samples)
def normalize(self, target_db=-20, max_gain_db=300.0):
"""Normalize audio to be of the desired RMS value in decibels.
Note that this is an in-place transformation.
:param target_db: Target RMS value in decibels. This value should be
less than 0.0 as 0.0 is full-scale audio.
:type target_db: float
:param max_gain_db: Max amount of gain in dB that can be applied for
normalization. This is to prevent nans when
attempting to normalize a signal consisting of
all zeros.
:type max_gain_db: float
:raises ValueError: If the required gain to normalize the segment to
the target_db value exceeds max_gain_db.
"""
gain = target_db - self.rms_db
if gain > max_gain_db:
raise ValueError(
"Unable to normalize segment to %f dB because the "
"the probable gain have exceeds max_gain_db (%f dB)" %
(target_db, max_gain_db))
self.gain_db(min(max_gain_db, target_db - self.rms_db))
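# A hedged numeric sketch of the RMS normalization above, in plain numpy: the
# gain is target_db minus the current RMS level in dB, and applying it brings
# the RMS to the target. The sample values and target below are illustrative.
import numpy as np

samples = 0.05 * np.random.randn(16000).astype('float32')
rms_db = 10 * np.log10(np.mean(samples ** 2))
target_db = -20.0
gain_db = target_db - rms_db                      # same quantity normalize() uses
normalized = samples * 10. ** (gain_db / 20.)
print(10 * np.log10(np.mean(normalized ** 2)))    # ~ -20.0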
def normalize_online_bayesian(self,
target_db,
prior_db,
prior_samples,
startup_delay=0.0):
"""Normalize audio using a production-compatible online/causal
algorithm. This uses an exponential likelihood and gamma prior to
make online estimates of the RMS even when there are very few samples.
Note that this is an in-place transformation.
:param target_db: Target RMS value in decibels.
        :type target_db: float
:param prior_db: Prior RMS estimate in decibels.
:type prior_db: float
:param prior_samples: Prior strength in number of samples.
:type prior_samples: float
:param startup_delay: Default 0.0s. If provided, this function will
accrue statistics for the first startup_delay
seconds before applying online normalization.
:type startup_delay: float
"""
# Estimate total RMS online.
startup_sample_idx = min(self.num_samples - 1,
int(self.sample_rate * startup_delay))
prior_mean_squared = 10.**(prior_db / 10.)
prior_sum_of_squares = prior_mean_squared * prior_samples
cumsum_of_squares = np.cumsum(self.samples**2)
sample_count = np.arange(self.num_samples) + 1
if startup_sample_idx > 0:
cumsum_of_squares[:startup_sample_idx] = \
cumsum_of_squares[startup_sample_idx]
sample_count[:startup_sample_idx] = \
sample_count[startup_sample_idx]
mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
(sample_count + prior_samples))
rms_estimate_db = 10 * np.log10(mean_squared_estimate)
# Compute required time-varying gain.
gain_db = target_db - rms_estimate_db
self.gain_db(gain_db)
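# An illustrative plain-numpy sketch of the online (causal) RMS estimate used
# above: a prior mean-square level (prior_db, weighted by prior_samples
# pseudo-counts) is blended with the running sum of squares, so early samples
# are dominated by the prior and later samples by the data. Values are made up.
import numpy as np

x = 0.1 * np.random.randn(8000)
prior_db, prior_samples = -20.0, 1000
prior_mean_squared = 10. ** (prior_db / 10.)
cumsum_of_squares = np.cumsum(x ** 2)
sample_count = np.arange(len(x)) + 1
mean_squared_estimate = ((cumsum_of_squares + prior_mean_squared * prior_samples)
                         / (sample_count + prior_samples))
rms_estimate_db = 10 * np.log10(mean_squared_estimate)   # per-sample estimate
print(rms_estimate_db[0], rms_estimate_db[-1])            # prior-driven vs. data-driven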
def resample(self, target_sample_rate, filter='kaiser_best'):
"""Resample the audio to a target sample rate.
Note that this is an in-place transformation.
:param target_sample_rate: Target sample rate.
:type target_sample_rate: int
        :param filter: The resampling filter to use, one of {'kaiser_best',
                       'kaiser_fast'}.
:type filter: str
"""
self._samples = resampy.resample(
self.samples, self.sample_rate, target_sample_rate, filter=filter)
self._sample_rate = target_sample_rate
def pad_silence(self, duration, sides='both'):
"""Pad this audio sample with a period of silence.
Note that this is an in-place transformation.
:param duration: Length of silence in seconds to pad.
:type duration: float
:param sides: Position for padding:
'beginning' - adds silence in the beginning;
'end' - adds silence in the end;
'both' - adds silence in both the beginning and the end.
:type sides: str
:raises ValueError: If sides is not supported.
"""
if duration == 0.0:
return self
cls = type(self)
silence = self.make_silence(duration, self._sample_rate)
if sides == "beginning":
padded = cls.concatenate(silence, self)
elif sides == "end":
padded = cls.concatenate(self, silence)
elif sides == "both":
padded = cls.concatenate(silence, self, silence)
else:
raise ValueError("Unknown value for the sides %s" % sides)
self._samples = padded._samples
def shift(self, shift_ms):
"""Shift the audio in time. If `shift_ms` is positive, shift with time
advance; if negative, shift with time delay. Silence are padded to
keep the duration unchanged.
Note that this is an in-place transformation.
:param shift_ms: Shift time in millseconds. If positive, shift with
time advance; if negative; shift with time delay.
:type shift_ms: float
:raises ValueError: If shift_ms is longer than audio duration.
"""
if abs(shift_ms) / 1000.0 > self.duration:
raise ValueError("Absolute value of shift_ms should be smaller "
"than audio duration.")
shift_samples = int(shift_ms * self._sample_rate / 1000)
if shift_samples > 0:
# time advance
self._samples[:-shift_samples] = self._samples[shift_samples:]
self._samples[-shift_samples:] = 0
elif shift_samples < 0:
# time delay
self._samples[-shift_samples:] = self._samples[:shift_samples]
self._samples[:-shift_samples] = 0
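# A small usage sketch of pad_silence() and shift(), assuming the
# AudioSegment(samples, sample_rate) constructor defined earlier in this file.
# Both methods mutate the segment in place; the numbers are illustrative.
import numpy as np
from data_utils.audio import AudioSegment

seg = AudioSegment(np.random.randn(16000).astype('float32'), 16000)
seg.pad_silence(1.0, sides='both')   # 1 s of audio becomes 3 s
seg.shift(250)                       # positive shift_ms means time advance
print(seg.duration)                  # 3.0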
def subsegment(self, start_sec=None, end_sec=None):
"""Cut the AudioSegment between given boundaries.
Note that this is an in-place transformation.
:param start_sec: Beginning of subsegment in seconds.
:type start_sec: float
:param end_sec: End of subsegment in seconds.
:type end_sec: float
:raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
of bounds in time.
"""
start_sec = 0.0 if start_sec is None else start_sec
end_sec = self.duration if end_sec is None else end_sec
if start_sec < 0.0:
start_sec = self.duration + start_sec
if end_sec < 0.0:
end_sec = self.duration + end_sec
if start_sec < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start_sec)
if end_sec < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end_sec)
if start_sec > end_sec:
raise ValueError("The slice start position (%f s) is later than "
"the end position (%f s)." % (start_sec, end_sec))
if end_sec > self.duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end_sec, self.duration))
start_sample = int(round(start_sec * self._sample_rate))
end_sample = int(round(end_sec * self._sample_rate))
self._samples = self._samples[start_sample:end_sample]
def random_subsegment(self, subsegment_length, rng=None):
"""Cut the specified length of the audiosegment randomly.
Note that this is an in-place transformation.
:param subsegment_length: Subsegment length in seconds.
:type subsegment_length: float
:param rng: Random number generator state.
:type rng: random.Random
        :raises ValueError: If the length of the subsegment is greater than
                            the original segment.
"""
rng = random.Random() if rng is None else rng
if subsegment_length > self.duration:
raise ValueError("Length of subsegment must not be greater "
"than original segment.")
start_time = rng.uniform(0.0, self.duration - subsegment_length)
self.subsegment(start_time, start_time + subsegment_length)
def convolve(self, impulse_segment, allow_resample=False):
"""Convolve this audio segment with the given impulse segment.
Note that this is an in-place transformation.
:param impulse_segment: Impulse response segments.
:type impulse_segment: AudioSegment
:param allow_resample: Indicates whether resampling is allowed when
the impulse_segment has a different sample
rate from this signal.
:type allow_resample: bool
        :raises ValueError: If the sample rates of the two audio segments do
                            not match when resampling is not allowed.
"""
if allow_resample and self.sample_rate != impulse_segment.sample_rate:
            impulse_segment.resample(self.sample_rate)  # resample() is in-place
if self.sample_rate != impulse_segment.sample_rate:
raise ValueError("Impulse segment's sample rate (%d Hz) is not"
"equal to base signal sample rate (%d Hz)." %
(impulse_segment.sample_rate, self.sample_rate))
samples = signal.fftconvolve(self.samples, impulse_segment.samples,
"full")
self._samples = samples
def convolve_and_normalize(self, impulse_segment, allow_resample=False):
"""Convolve and normalize the resulting audio segment so that it
has the same average power as the input signal.
Note that this is an in-place transformation.
:param impulse_segment: Impulse response segments.
:type impulse_segment: AudioSegment
:param allow_resample: Indicates whether resampling is allowed when
the impulse_segment has a different sample
rate from this signal.
:type allow_resample: bool
"""
target_db = self.rms_db
self.convolve(impulse_segment, allow_resample=allow_resample)
self.normalize(target_db)
def add_noise(self,
noise,
snr_dB,
allow_downsampling=False,
max_gain_db=300.0,
rng=None):
"""Add the given noise segment at a specific signal-to-noise ratio.
If the noise segment is longer than this segment, a random subsegment
of matching length is sampled from it and used instead.
Note that this is an in-place transformation.
:param noise: Noise signal to add.
:type noise: AudioSegment
:param snr_dB: Signal-to-Noise Ratio, in decibels.
:type snr_dB: float
:param allow_downsampling: Whether to allow the noise signal to be
downsampled to match the base signal sample
rate.
:type allow_downsampling: bool
:param max_gain_db: Maximum amount of gain to apply to noise signal
before adding it in. This is to prevent attempting
to apply infinite gain to a zero signal.
:type max_gain_db: float
:param rng: Random number generator state.
:type rng: None|random.Random
        :raises ValueError: If the sample rates of the two audio segments do
                            not match when downsampling is not allowed, or if
                            the noise segment is shorter than the original
                            audio segment.
"""
rng = random.Random() if rng is None else rng
if allow_downsampling and noise.sample_rate > self.sample_rate:
            noise.resample(self.sample_rate)  # resample() is in-place
if noise.sample_rate != self.sample_rate:
raise ValueError("Noise sample rate (%d Hz) is not equal to base "
"signal sample rate (%d Hz)." % (noise.sample_rate,
self.sample_rate))
if noise.duration < self.duration:
raise ValueError("Noise signal (%f sec) must be at least as long as"
" base signal (%f sec)." %
(noise.duration, self.duration))
noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
noise_new = copy.deepcopy(noise)
noise_new.random_subsegment(self.duration, rng=rng)
noise_new.gain_db(noise_gain_db)
self.superimpose(noise_new)
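# A hedged plain-numpy sketch of the SNR arithmetic in add_noise(): the noise
# gain is chosen so that signal RMS (dB) minus gained noise RMS (dB) equals the
# requested snr_dB. The levels below are made up for illustration.
import numpy as np

def rms_db(x):
    return 10 * np.log10(np.mean(x ** 2))

signal = 0.1 * np.random.randn(16000)
noise = 0.3 * np.random.randn(16000)
snr_dB = 10.0
noise_gain_db = rms_db(signal) - rms_db(noise) - snr_dB
noise_scaled = noise * 10. ** (noise_gain_db / 20.)
print(rms_db(signal) - rms_db(noise_scaled))   # ~ 10.0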
@property
def samples(self):
"""Return audio samples.
:return: Audio samples.
:rtype: ndarray
"""
return self._samples.copy()
@property
def sample_rate(self):
"""Return audio sample rate.
:return: Audio sample rate.
:rtype: int
"""
return self._sample_rate
@property
def num_samples(self):
"""Return number of samples.
:return: Number of samples.
:rtype: int
"""
return self._samples.shape[0]
@property
def duration(self):
"""Return audio duration.
:return: Audio duration in seconds.
:rtype: float
"""
return self._samples.shape[0] / float(self._sample_rate)
@property
def rms_db(self):
"""Return root mean square energy of the audio in decibels.
:return: Root mean square energy in decibels.
:rtype: float
"""
# square root => multiply by 10 instead of 20 for dBs
mean_square = np.mean(self._samples**2)
return 10 * np.log10(mean_square)
def _convert_samples_to_float32(self, samples):
"""Convert sample type to float32.
        Audio sample type is usually integer or floating-point.
Integers will be scaled to [-1, 1] in float32.
"""
float32_samples = samples.astype('float32')
if samples.dtype in np.sctypes['int']:
bits = np.iinfo(samples.dtype).bits
float32_samples *= (1. / 2**(bits - 1))
elif samples.dtype in np.sctypes['float']:
pass
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return float32_samples
def _convert_samples_from_float32(self, samples, dtype):
"""Convert sample type from float32 to dtype.
        Audio sample type is usually integer or floating-point. For integer
        types, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.
        This is for writing an audio file.
"""
dtype = np.dtype(dtype)
output_samples = samples.copy()
if dtype in np.sctypes['int']:
bits = np.iinfo(dtype).bits
output_samples *= (2**(bits - 1) / 1.)
min_val = np.iinfo(dtype).min
max_val = np.iinfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
        elif dtype in np.sctypes['float']:
min_val = np.finfo(dtype).min
max_val = np.finfo(dtype).max
output_samples[output_samples > max_val] = max_val
output_samples[output_samples < min_val] = min_val
else:
raise TypeError("Unsupported sample type: %s." % samples.dtype)
return output_samples.astype(dtype)
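# A minimal plain-numpy sketch of the sample-type conversion helpers above:
# int16 samples are scaled into [-1, 1) as float32, and converted back by the
# inverse scaling with clipping. The sample values are illustrative.
import numpy as np

int16_samples = np.array([-32768, -16384, 0, 16384, 32767], dtype='int16')
as_float32 = int16_samples.astype('float32') / 2 ** 15         # scale to [-1, 1)
back_to_int16 = np.clip(as_float32 * 2 ** 15, -32768, 32767).astype('int16')
print(as_float32)
print(back_to_int16)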
"""Contains the data augmentation pipeline."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor
class AugmentationPipeline(object):
"""Build a pre-processing pipeline with various augmentation models.Such a
data augmentation pipeline is oftern leveraged to augment the training
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
The pipeline is built according the the augmentation configuration in json
string, e.g.
.. code-block::
'[{"type": "volume",
"params": {"min_gain_dBFS": -15,
"max_gain_dBFS": 15},
"prob": 0.5},
{"type": "speed",
"params": {"min_speed_rate": 0.8,
"max_speed_rate": 1.2},
"prob": 0.5}
]'
    This augmentation configuration inserts two augmentation models
    into the pipeline: one VolumePerturbAugmentor and one
    SpeedPerturbAugmentor. "prob" indicates the probability that each
    augmentor takes effect.
:param augmentation_config: Augmentation configuration in json string.
:type augmentation_config: str
:param random_seed: Random seed.
:type random_seed: int
    :raises ValueError: If the augmentation json config is in an incorrect format.
"""
def __init__(self, augmentation_config, random_seed=0):
self._rng = random.Random(random_seed)
self._augmentors, self._rates = self._parse_pipeline_from(
augmentation_config)
def transform_audio(self, audio_segment):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegment|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
if self._rng.uniform(0., 1.) <= rate:
augmentor.transform_audio(audio_segment)
def _parse_pipeline_from(self, config_json):
"""Parse the config json to build a augmentation pipelien."""
try:
configs = json.loads(config_json)
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in configs
]
rates = [config["prob"] for config in configs]
except Exception as e:
raise ValueError("Failed to parse the augmentation config json: "
"%s" % str(e))
return augmentors, rates
def _get_augmentor(self, augmentor_type, params):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
elif augmentor_type == "speed":
return SpeedPerturbAugmentor(self._rng, **params)
elif augmentor_type == "resample":
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
"""Contains the abstract base class for augmentation models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta, abstractmethod
class AugmentorBase(object):
"""Abstract base class for augmentation model (augmentor) class.
All augmentor classes should inherit from this class, and implement the
following abstract methods.
"""
__metaclass__ = ABCMeta
@abstractmethod
def __init__(self):
pass
@abstractmethod
def transform_audio(self, audio_segment):
"""Adds various effects to the input audio segment. Such effects
will augment the training data to make the model invariant to certain
        types of perturbations in the real world, improving the model's
        generalization ability.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
"""
pass
"""Contain the online bayesian normalization augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class OnlineBayesianNormalizationAugmentor(AugmentorBase):
"""Augmentation model for adding online bayesian normalization.
:param rng: Random generator object.
:type rng: random.Random
:param target_db: Target RMS value in decibels.
:type target_db: float
:param prior_db: Prior RMS estimate in decibels.
:type prior_db: float
:param prior_samples: Prior strength in number of samples.
:type prior_samples: int
:param startup_delay: Default 0.0s. If provided, this function will
accrue statistics for the first startup_delay
seconds before applying online normalization.
    :type startup_delay: float
"""
def __init__(self,
rng,
target_db,
prior_db,
prior_samples,
startup_delay=0.0):
self._target_db = target_db
self._prior_db = prior_db
self._prior_samples = prior_samples
self._rng = rng
self._startup_delay = startup_delay
def transform_audio(self, audio_segment):
"""Normalizes the input audio using the online Bayesian approach.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
self._prior_samples,
self._startup_delay)
"""Contain the resample augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class ResampleAugmentor(AugmentorBase):
"""Augmentation model for resampling.
See more info here:
https://ccrma.stanford.edu/~jos/resample/index.html
:param rng: Random generator object.
:type rng: random.Random
:param new_sample_rate: New sample rate in Hz.
:type new_sample_rate: int
"""
def __init__(self, rng, new_sample_rate):
self._new_sample_rate = new_sample_rate
self._rng = rng
def transform_audio(self, audio_segment):
"""Resamples the input audio to a target sample rate.
Note that this is an in-place transformation.
        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
"""
audio_segment.resample(self._new_sample_rate)
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class ShiftPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random shift perturbation.
:param rng: Random generator object.
:type rng: random.Random
:param min_shift_ms: Minimal shift in milliseconds.
:type min_shift_ms: float
:param max_shift_ms: Maximal shift in milliseconds.
:type max_shift_ms: float
"""
def __init__(self, rng, min_shift_ms, max_shift_ms):
self._min_shift_ms = min_shift_ms
self._max_shift_ms = max_shift_ms
self._rng = rng
def transform_audio(self, audio_segment):
"""Shift audio.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
"""
shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
audio_segment.shift(shift_ms)
"""Contain the speech perturbation augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class SpeedPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding speed perturbation.
See reference paper here:
http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
:param rng: Random generator object.
:type rng: random.Random
:param min_speed_rate: Lower bound of new speed rate to sample and should
not be smaller than 0.9.
:type min_speed_rate: float
:param max_speed_rate: Upper bound of new speed rate to sample and should
not be larger than 1.1.
:type max_speed_rate: float
"""
def __init__(self, rng, min_speed_rate, max_speed_rate):
if min_speed_rate < 0.9:
raise ValueError(
"Sampling speed below 0.9 can cause unnatural effects")
if max_speed_rate > 1.1:
raise ValueError(
"Sampling speed above 1.1 can cause unnatural effects")
self._min_speed_rate = min_speed_rate
self._max_speed_rate = max_speed_rate
self._rng = rng
def transform_audio(self, audio_segment):
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
sampled_speed = self._rng.uniform(self._min_speed_rate,
self._max_speed_rate)
audio_segment.change_speed(sampled_speed)
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class VolumePerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random volume perturbation.
This is used for multi-loudness training of PCEN. See
https://arxiv.org/pdf/1607.05666v1.pdf
for more details.
:param rng: Random generator object.
:type rng: random.Random
:param min_gain_dBFS: Minimal gain in dBFS.
:type min_gain_dBFS: float
:param max_gain_dBFS: Maximal gain in dBFS.
:type max_gain_dBFS: float
"""
def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
self._min_gain_dBFS = min_gain_dBFS
self._max_gain_dBFS = max_gain_dBFS
self._rng = rng
def transform_audio(self, audio_segment):
"""Change audio loadness.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
"""
gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
audio_segment.gain_db(gain)
"""Contains data generator for orgnaizing various audio data preprocessing
pipeline and offering data reader interface of PaddlePaddle requirements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
import multiprocessing
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer
class DataGenerator(object):
"""
    DataGenerator provides a basic audio data preprocessing pipeline and offers
    data reader interfaces that meet PaddlePaddle's requirements.
:param vocab_filepath: Vocabulary filepath for indexing tokenized
transcripts.
:type vocab_filepath: basestring
:param mean_std_filepath: File containing the pre-computed mean and stddev.
:type mean_std_filepath: None|basestring
:param augmentation_config: Augmentation configuration in json string.
Details see AugmentationPipeline.__doc__.
:type augmentation_config: str
:param max_duration: Audio with duration (in seconds) greater than
this will be discarded.
:type max_duration: float
:param min_duration: Audio with duration (in seconds) smaller than
this will be discarded.
:type min_duration: float
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
    :type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param use_dB_normalization: Whether to normalize the audio to -20 dB
before extracting the features.
:type use_dB_normalization: bool
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
"""
def __init__(self,
vocab_filepath,
mean_std_filepath,
augmentation_config='{}',
max_duration=float('inf'),
min_duration=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
self._augmentation_pipeline = AugmentationPipeline(
augmentation_config=augmentation_config, random_seed=random_seed)
self._speech_featurizer = SpeechFeaturizer(
vocab_filepath=vocab_filepath,
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
use_dB_normalization=use_dB_normalization)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0
def batch_reader_creator(self,
manifest_path,
batch_size,
min_batch_size=1,
padding_to=-1,
flatten=False,
sortagrad=False,
shuffle_method="batch_shuffle"):
"""
Batch data reader creator for audio data. Return a callable generator
function to produce batches of data.
Audio features within one batch will be padded with zeros to have the
same shape, or a user-defined shape.
:param manifest_path: Filepath of manifest for audio files.
:type manifest_path: basestring
:param batch_size: Number of instances in a batch.
:type batch_size: int
:param min_batch_size: Any batch with batch size smaller than this will
be discarded. (To be deprecated in the future.)
:type min_batch_size: int
        :param padding_to: If set -1, the maximum shape in the batch
will be used as the target shape for padding.
Otherwise, `padding_to` will be the target shape.
:type padding_to: int
        :param flatten: If set True, audio features will be flattened to a 1-D array.
:type flatten: bool
:param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch to speed up training.
:type sortagrad: bool
:param shuffle_method: Shuffle method. Options:
'' or None: no shuffle.
'instance_shuffle': instance-wise shuffle.
                               'batch_shuffle': similarly-sized instances are
                                                grouped into batches, and then
                                                the batches are shuffled.
For more details, please see
``_batch_shuffle.__doc__``.
'batch_shuffle_clipped': 'batch_shuffle' with
head shift and tail
clipping. For more
details, please see
``_batch_shuffle``.
If sortagrad is True, shuffle is disabled
for the first epoch.
:type shuffle_method: None|str
:return: Batch reader function, producing batches of data when called.
:rtype: callable
"""
def batch_reader():
# read manifest
manifest = utils.read_manifest(
manifest_path=manifest_path,
max_duration=self._max_duration,
min_duration=self._min_duration)
# sort (by duration) or batch-wise shuffle the manifest
if self._epoch == 0 and sortagrad:
manifest.sort(key=lambda x: x["duration"])
else:
if shuffle_method == "batch_shuffle":
manifest = self._batch_shuffle(
manifest, batch_size, clipped=False)
elif shuffle_method == "batch_shuffle_clipped":
manifest = self._batch_shuffle(
manifest, batch_size, clipped=True)
elif shuffle_method == "instance_shuffle":
self._rng.shuffle(manifest)
elif not shuffle_method:
pass
else:
raise ValueError("Unknown shuffle method %s." %
shuffle_method)
# prepare batches
instance_reader = self._instance_reader_creator(manifest)
batch = []
for instance in instance_reader():
batch.append(instance)
if len(batch) == batch_size:
yield self._padding_batch(batch, padding_to, flatten)
batch = []
if len(batch) >= min_batch_size:
yield self._padding_batch(batch, padding_to, flatten)
self._epoch += 1
return batch_reader
@property
def feeding(self):
"""Returns data reader's feeding dict.
:return: Data feeding dict.
:rtype: dict
"""
return {"audio_spectrogram": 0, "transcript_text": 1}
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._speech_featurizer.vocab_list
def _process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data."""
speech_segment = SpeechSegment.from_file(filename, transcript)
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
specgram = self._normalizer.apply(specgram)
return specgram, text_ids
def _instance_reader_creator(self, manifest):
"""
Instance reader creator. Create a callable function to produce
instances of data.
Instance: a tuple of ndarray of audio spectrogram and a list of
token indices for transcript.
"""
def reader():
for instance in manifest:
yield instance
def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])
return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.
        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).
        If `flatten` is True, features will be flattened to a 1-D array.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
# padding
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
new_batch.append((padded_audio, text))
return new_batch
def _batch_shuffle(self, manifest, batch_size, clipped=False):
"""Put similarly-sized instances into minibatches for better efficiency
and make a batch-wise shuffle.
1. Sort the audio clips by duration.
2. Generate a random number `k`, k in [0, batch_size).
3. Randomly shift `k` instances in order to create different batches
for different epochs. Create minibatches.
4. Shuffle the minibatches.
:param manifest: Manifest contents. List of dict.
:type manifest: list
        :param batch_size: Batch size. This size is also used to generate
                           a random number for batch shuffle.
:type batch_size: int
:param clipped: Whether to clip the heading (small shift) and trailing
(incomplete batch) instances.
:type clipped: bool
        :return: Batch shuffled manifest.
:rtype: list
"""
manifest.sort(key=lambda x: x["duration"])
shift_len = self._rng.randint(0, batch_size - 1)
batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
self._rng.shuffle(batch_manifest)
batch_manifest = list(sum(batch_manifest, ()))
        if not clipped:
            res_len = len(manifest) - shift_len - len(batch_manifest)
            # Guard against res_len == 0: manifest[-0:] would re-append the
            # whole manifest.
            if res_len > 0:
                batch_manifest.extend(manifest[-res_len:])
            batch_manifest.extend(manifest[0:shift_len])
return batch_manifest
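# A toy sketch of the batch-shuffle steps above on a plain list (no audio
# involved): sort by duration, shift by a random k < batch_size, group into
# batches, shuffle the batches, then re-append the clipped head and tail when
# clipped is False. The durations below are made up.
import random

manifest = [{"duration": d} for d in [5, 1, 3, 2, 4, 6, 8, 7]]
batch_size, rng = 2, random.Random(0)
manifest.sort(key=lambda x: x["duration"])
k = rng.randint(0, batch_size - 1)
batches = list(zip(*[iter(manifest[k:])] * batch_size))
rng.shuffle(batches)
shuffled = list(sum(batches, ()))
res_len = len(manifest) - k - len(shuffled)
if res_len > 0:
    shuffled.extend(manifest[-res_len:])
shuffled.extend(manifest[0:k])
print([m["duration"] for m in shuffled])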
"""Contains the audio featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from data_utils import utils
from data_utils.audio import AudioSegment
class AudioFeaturizer(object):
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
Currently, it only supports feature type of linear spectrogram.
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
    :type max_freq: None|float
:param target_sample_rate: Audio are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
self._stride_ms = stride_ms
self._window_ms = window_ms
self._max_freq = max_freq
self._target_sample_rate = target_sample_rate
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB
def featurize(self,
audio_segment,
allow_downsampling=True,
                  allow_upsampling=True):
"""Extract audio features from AudioSegment or SpeechSegment.
:param audio_segment: Audio/speech segment to extract features from.
:type audio_segment: AudioSegment|SpeechSegment
:param allow_downsampling: Whether to allow audio downsampling before
featurizing.
:type allow_downsampling: bool
:param allow_upsampling: Whether to allow audio upsampling before
featurizing.
:type allow_upsampling: bool
:return: Spectrogram audio feature in 2darray.
:rtype: ndarray
:raises ValueError: If audio sample rate is not supported.
"""
# upsampling or downsampling
if ((audio_segment.sample_rate > self._target_sample_rate and
allow_downsampling) or
(audio_segment.sample_rate < self._target_sample_rate and
allow_upsampling)):
audio_segment.resample(self._target_sample_rate)
if audio_segment.sample_rate != self._target_sample_rate:
raise ValueError("Audio sample rate is not supported. "
"Turn allow_downsampling or allow up_sampling on.")
# decibel normalization
if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)
def _compute_specgram(self, samples, sample_rate):
"""Extract various audio features."""
if self._specgram_type == 'linear':
return self._compute_linear_specgram(
samples, sample_rate, self._stride_ms, self._window_ms,
self._max_freq)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy."""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.log(specgram[:ind, :] + eps)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
truncate_size = (len(samples) - window_size) % stride_size
samples = samples[:len(samples) - truncate_size]
nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
nstrides = (samples.strides[0], samples.strides[0] * stride_size)
windows = np.lib.stride_tricks.as_strided(
samples, shape=nshape, strides=nstrides)
assert np.all(
windows[:, 1] == samples[stride_size:(stride_size + window_size)])
# window weighting, squared Fast Fourier Transform (fft), scaling
weighting = np.hanning(window_size)[:, None]
fft = np.fft.rfft(windows * weighting, axis=0)
fft = np.absolute(fft)**2
scale = np.sum(weighting**2) * sample_rate
fft[1:-1, :] *= (2.0 / scale)
fft[(0, -1), :] /= scale
# prepare fft frequency list
freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
return fft, freqs
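# A small numeric sketch of the framing arithmetic behind the linear
# spectrogram above: with 16 kHz audio, 20 ms windows and 10 ms strides give
# 320-sample windows hopped by 160 samples, and a real FFT yields
# window_size // 2 + 1 frequency bins spaced sample_rate / window_size apart.
import numpy as np

sample_rate, stride_ms, window_ms = 16000, 10.0, 20.0
stride_size = int(0.001 * sample_rate * stride_ms)    # 160 samples
window_size = int(0.001 * sample_rate * window_ms)    # 320 samples
samples = np.random.randn(sample_rate)                # one second of audio
num_frames = (len(samples) - window_size) // stride_size + 1
freqs = float(sample_rate) / window_size * np.arange(window_size // 2 + 1)
print(num_frames)      # 99 frames
print(freqs[-1])       # 8000.0 Hz, the Nyquist frequency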
"""Contains the speech featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer(object):
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
Currently, for audio parts, it only supports feature type of linear
spectrogram; for transcript parts, it only supports char-level tokenizing
and conversion into a list of token indices. Note that the token indexing
order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
    :type vocab_filepath: basestring
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
    :type max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibels before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
vocab_filepath,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB)
self._text_featurizer = TextFeaturizer(vocab_filepath)
def featurize(self, speech_segment):
"""Extract features for speech segment.
1. For audio parts, extract the audio features.
2. For transcript parts, convert text string to a list of token indices
in char-level.
        :param speech_segment: Speech segment to extract features from.
        :type speech_segment: SpeechSegment
:return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of
char-level token indices.
:rtype: tuple
"""
audio_feature = self._audio_featurizer.featurize(speech_segment)
text_ids = self._text_featurizer.featurize(speech_segment.transcript)
return audio_feature, text_ids
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return self._text_featurizer.vocab_size
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._text_featurizer.vocab_list
"""Contains the text featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
class TextFeaturizer(object):
"""Text featurizer, for processing or extracting features from text.
Currently, it only supports char-level tokenizing and conversion into
a list of token indices. Note that the token indexing order follows the
given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
    :type vocab_filepath: basestring
"""
def __init__(self, vocab_filepath):
self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
vocab_filepath)
def featurize(self, text):
"""Convert text string to a list of token indices in char-level.Note
that the token indexing order follows the given vocabulary file.
:param text: Text to process.
:type text: basestring
:return: List of char-level token indices.
:rtype: list
"""
tokens = self._char_tokenize(text)
return [self._vocab_dict[token] for token in tokens]
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return len(self._vocab_list)
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._vocab_list
def _char_tokenize(self, text):
"""Character tokenizer."""
return list(text.strip())
def _load_vocabulary_from_file(self, vocab_filepath):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
vocab_dict = dict(
[(token, id) for (id, token) in enumerate(vocab_list)])
return vocab_dict, vocab_list
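# A usage sketch for the char-level featurizer, assuming data_utils is
# importable. A tiny vocabulary file is written first; its content and the
# filename are illustrative only.
from data_utils.featurizer.text_featurizer import TextFeaturizer

with open('tmp_vocab.txt', 'w') as vocab_file:
    vocab_file.write('\n'.join([' ', 'a', 'b', 'c']) + '\n')
featurizer = TextFeaturizer('tmp_vocab.txt')
print(featurizer.vocab_size)           # 4
print(featurizer.featurize('ab cab'))  # [1, 2, 0, 3, 1, 2]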
"""Contains feature normalizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import data_utils.utils as utils
from data_utils.audio import AudioSegment
class FeatureNormalizer(object):
"""Feature normalizer. Normalize features to be of zero mean and unit
stddev.
    If mean_std_filepath is provided (not None), the normalizer will directly
    initialize from the file. Otherwise, both manifest_path and featurize_func
should be given for on-the-fly mean and stddev computing.
:param mean_std_filepath: File containing the pre-computed mean and stddev.
:type mean_std_filepath: None|basestring
:param manifest_path: Manifest of instances for computing mean and stddev.
    :type manifest_path: None|basestring
:param featurize_func: Function to extract features. It should be callable
with ``featurize_func(audio_segment)``.
:type featurize_func: None|callable
:param num_samples: Number of random samples for computing mean and stddev.
:type num_samples: int
:param random_seed: Random seed for sampling instances.
:type random_seed: int
:raises ValueError: If both mean_std_filepath and manifest_path
(or both mean_std_filepath and featurize_func) are None.
"""
def __init__(self,
mean_std_filepath,
manifest_path=None,
featurize_func=None,
num_samples=500,
random_seed=0):
if not mean_std_filepath:
if not (manifest_path and featurize_func):
raise ValueError("If mean_std_filepath is None, meanifest_path "
"and featurize_func should not be None.")
self._rng = random.Random(random_seed)
self._compute_mean_std(manifest_path, featurize_func, num_samples)
else:
self._read_mean_std_from_file(mean_std_filepath)
def apply(self, features, eps=1e-14):
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:type features: ndarray
        :param eps: Small value added to stddev to provide numerical stability.
:type eps: float
:return: Normalized features.
:rtype: ndarray
"""
return (features - self._mean) / (self._std + eps)
def write_to_file(self, filepath):
"""Write the mean and stddev to the file.
:param filepath: File to write mean and stddev.
:type filepath: basestring
"""
np.savez(filepath, mean=self._mean, std=self._std)
def _read_mean_std_from_file(self, filepath):
"""Load mean and std from file."""
npzfile = np.load(filepath)
self._mean = npzfile["mean"]
self._std = npzfile["std"]
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
"""Compute mean and std from randomly sampled instances."""
manifest = utils.read_manifest(manifest_path)
sampled_manifest = self._rng.sample(manifest, num_samples)
features = []
for instance in sampled_manifest:
features.append(
featurize_func(
AudioSegment.from_file(instance["audio_filepath"])))
features = np.hstack(features)
self._mean = np.mean(features, axis=1).reshape([-1, 1])
self._std = np.std(features, axis=1).reshape([-1, 1])
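# A plain-numpy sketch of what the normalizer computes and applies:
# per-frequency-bin mean and stddev are estimated over time frames (axis 1)
# and applied as (features - mean) / (std + eps). The toy feature matrix is
# illustrative.
import numpy as np

features = np.random.randn(161, 300) * 3.0 + 1.5   # [freq_bins x frames]
mean = np.mean(features, axis=1).reshape([-1, 1])
std = np.std(features, axis=1).reshape([-1, 1])
normalized = (features - mean) / (std + 1e-14)
print(normalized.mean(axis=1)[:3])   # ~ 0
print(normalized.std(axis=1)[:3])    # ~ 1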
"""Contains the speech segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from data_utils.audio import AudioSegment
class SpeechSegment(AudioSegment):
"""Speech segment abstraction, a subclass of AudioSegment,
with an additional transcript.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:param transcript: Transcript text for the speech.
    :type transcript: basestring
:raises TypeError: If the sample data type is not float or int.
"""
def __init__(self, samples, sample_rate, transcript):
AudioSegment.__init__(self, samples, sample_rate)
self._transcript = transcript
def __eq__(self, other):
"""Return whether two objects are equal.
"""
if not AudioSegment.__eq__(self, other):
return False
if self._transcript != other._transcript:
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
@classmethod
def from_file(cls, filepath, transcript):
"""Create speech segment from audio file and corresponding transcript.
:param filepath: Filepath or file object to audio file.
:type filepath: basestring|file
:param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
"""
audio = AudioSegment.from_file(filepath)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def from_bytes(cls, bytes, transcript):
"""Create speech segment from a byte string and corresponding
transcript.
:param bytes: Byte string containing audio samples.
:type bytes: str
:param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
"""
audio = AudioSegment.from_bytes(bytes)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of speech segments together, both
audio and transcript will be concatenated.
:param *segments: Input speech segments to be concatenated.
:type *segments: tuple of SpeechSegment
:return: Speech segment instance.
:rtype: SpeechSegment
:raises ValueError: If the number of segments is zero, or if the
sample_rate of any two segments does not match.
:raises TypeError: If any segment is not SpeechSegment instance.
"""
if len(segments) == 0:
raise ValueError("No speech segments are given to concatenate.")
sample_rate = segments[0]._sample_rate
transcripts = ""
for seg in segments:
if sample_rate != seg._sample_rate:
raise ValueError("Can't concatenate segments with "
"different sample rates")
if type(seg) is not cls:
raise TypeError("Only speech segments of the same type "
"instance can be concatenated.")
transcripts += seg._transcript
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate, transcripts)
@classmethod
def slice_from_file(cls, filepath, transcript, start=None, end=None):
"""Loads a small section of an speech without having to load
the entire file into the memory which can be incredibly wasteful.
:param filepath: Filepath or file object to audio file.
:type filepath: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
        :param transcript: Transcript text for the speech. If not provided,
                           the default is an empty string.
        :type transcript: basestring
:return: SpeechSegment instance of the specified slice of the input
speech file.
:rtype: SpeechSegment
"""
        audio = AudioSegment.slice_from_file(filepath, start, end)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent speech segment of the given duration and
sample rate, transcript will be an empty string.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silence of the given duration.
:rtype: SpeechSegment
"""
audio = AudioSegment.make_silence(duration, sample_rate)
return cls(audio.samples, audio.sample_rate, "")
@property
def transcript(self):
"""Return the transcript text.
:return: Transcript text for the speech.
:rtype: basestring
"""
return self._transcript
"""Contains data helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
"""Load and parse manifest file.
Instances with durations outside [min_duration, max_duration] will be
filtered out.
:param manifest_path: Manifest file to load and parse.
:type manifest_path: basestring
:param max_duration: Maximal duration in seconds for instance filter.
:type max_duration: float
:param min_duration: Minimal duration in seconds for instance filter.
:type min_duration: float
:return: Manifest parsing results. List of dict.
:rtype: list
:raises IOError: If failed to parse the manifest.
"""
manifest = []
for json_line in open(manifest_path):
try:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
if (json_data["duration"] <= max_duration and
json_data["duration"] >= min_duration):
manifest.append(json_data)
return manifest
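# A hedged sketch of the manifest format consumed above: one JSON object per
# line with "audio_filepath", "duration" and "text" keys. The path below is
# hypothetical.
import json

manifest_line = json.dumps({
    'audio_filepath': '/path/to/example.flac',   # hypothetical path
    'duration': 2.45,
    'text': 'an example transcript'
})
print(json.loads(manifest_line)["duration"])     # 2.45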
"""Prepare Librispeech ASR datasets.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import os
import wget
import tarfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Libri",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
"--full_download",
default="True",
type=distutils.util.strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")
args = parser.parse_args()
def download(url, md5sum, target_dir):
"""
Download file from url to target_dir, and check md5sum.
"""
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1])
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
def unpack(filepath, target_dir):
"""
Unpack the file to the target_dir.
"""
print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
def create_manifest(data_dir, manifest_path):
"""
Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
if len(text_filelist) > 0:
text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
for line in open(text_filepath):
segments = line.strip().split()
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.join(data_dir, subfolder,
segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
json_lines.append(
json.dumps({
'audio_filepath': audio_filepath,
'duration': duration,
'text': text
}))
with open(manifest_path, 'w') as out_file:
for line in json_lines:
out_file.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""
    Download, unpack and create summary manifest file.
"""
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
filepath = download(url, md5sum, target_dir)
# unpack
unpack(filepath, target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path)
def main():
prepare_dataset(
url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean")
prepare_dataset(
url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
prepare_dataset(
url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other")
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other")
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360")
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500")
if __name__ == '__main__':
main()
cd librispeech
python librispeech.py
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
cd -
cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test
echo "All done."
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
"""Contains various CTC decoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from itertools import groupby
def ctc_best_path_decode(probs_seq, vocabulary):
"""Best path decoding, also called argmax decoding or greedy decoding.
    The path consisting of the most probable tokens is further post-processed
    to remove consecutive repetitions and all blanks.
:param probs_seq: 2-D list of probabilities over the vocabulary for each
character. Each element is a list of float probabilities
for one character.
:type probs_seq: list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:return: Decoding result string.
    :rtype: basestring
"""
# dimension verification
for probs in probs_seq:
if not len(probs) == len(vocabulary) + 1:
raise ValueError("probs_seq dimension mismatchedd with vocabulary")
# argmax to get the best index for each time step
max_index_list = list(np.array(probs_seq).argmax(axis=1))
# remove consecutive duplicate indexes
index_list = [index_group[0] for index_group in groupby(max_index_list)]
# remove blank indexes
blank_index = len(vocabulary)
index_list = [index for index in index_list if index != blank_index]
# convert index list to string
return ''.join([vocabulary[index] for index in index_list])
def ctc_decode(probs_seq, vocabulary, method):
"""CTC-like sequence decoding from a sequence of likelihood probablilites.
:param probs_seq: 2-D list of probabilities over the vocabulary for each
character. Each element is a list of float probabilities
for one character.
:type probs_seq: list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param method: Decoding method name, with options: "best_path".
:type method: basestring
:return: Decoding result string.
    :rtype: basestring
"""
for prob_list in probs_seq:
if not len(prob_list) == len(vocabulary) + 1:
raise ValueError("probs dimension mismatchedd with vocabulary")
if method == "best_path":
return ctc_best_path_decode(probs_seq, vocabulary)
else:
raise ValueError("Decoding method [%s] is not supported.")
# -*- coding: utf-8 -*-
"""This module provides functions to calculate error rate in different level.
e.g. wer for word-level, cer for char-level.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def _levenshtein_distance(ref, hyp):
"""Levenshtein distance is a string metric for measuring the difference between
two sequences. Informally, the levenshtein disctance is defined as the minimum
number of single-character edits (substitutions, insertions or deletions)
required to change one word into the other. We can naturally extend the edits to
word level when calculate levenshtein disctance for two sentences.
"""
ref_len = len(ref)
hyp_len = len(hyp)
# special case
if ref == hyp:
return 0
if ref_len == 0:
return hyp_len
if hyp_len == 0:
return ref_len
distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
# initialize distance matrix
for j in xrange(hyp_len + 1):
distance[0][j] = j
for i in xrange(ref_len + 1):
distance[i][0] = i
# calculate levenshtein distance
for i in xrange(1, ref_len + 1):
for j in xrange(1, hyp_len + 1):
if ref[i - 1] == hyp[j - 1]:
distance[i][j] = distance[i - 1][j - 1]
else:
s_num = distance[i - 1][j - 1] + 1
i_num = distance[i][j - 1] + 1
d_num = distance[i - 1][j] + 1
distance[i][j] = min(s_num, i_num, d_num)
return distance[ref_len][hyp_len]
def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
"""Calculate word error rate (WER). WER compares reference text and
hypothesis text in word-level. WER is defined as:
.. math::
WER = (Sw + Dw + Iw) / Nw
where
.. code-block:: text
        Sw is the number of words substituted,
Dw is the number of words deleted,
Iw is the number of words inserted,
Nw is the number of words in the reference
    We can use the Levenshtein distance to calculate WER. Please note that empty
    items will be removed when splitting sentences by the delimiter.
:param reference: The reference sentence.
:type reference: basestring
:param hypothesis: The hypothesis sentence.
:type hypothesis: basestring
    :param ignore_case: Whether to ignore case during comparison.
:type ignore_case: bool
:param delimiter: Delimiter of input sentences.
:type delimiter: char
:return: Word error rate.
:rtype: float
:raises ValueError: If the reference length is zero.
"""
    if ignore_case:
reference = reference.lower()
hypothesis = hypothesis.lower()
ref_words = filter(None, reference.split(delimiter))
hyp_words = filter(None, hypothesis.split(delimiter))
if len(ref_words) == 0:
raise ValueError("Reference's word number should be greater than 0.")
edit_distance = _levenshtein_distance(ref_words, hyp_words)
wer = float(edit_distance) / len(ref_words)
return wer
def cer(reference, hypothesis, ignore_case=False):
"""Calculate charactor error rate (CER). CER compares reference text and
hypothesis text in char-level. CER is defined as:
.. math::
CER = (Sc + Dc + Ic) / Nc
where
.. code-block:: text
Sc is the number of characters substituted,
Dc is the number of characters deleted,
Ic is the number of characters inserted
Nc is the number of characters in the reference
    We can use the Levenshtein distance to calculate CER. Chinese input should be
    encoded to unicode. Please note that the leading and trailing whitespace
    characters will be truncated and multiple consecutive whitespace characters
    in a sentence will be replaced by one whitespace character.
:param reference: The reference sentence.
:type reference: basestring
:param hypothesis: The hypothesis sentence.
:type hypothesis: basestring
    :param ignore_case: Whether to ignore case during comparison.
:type ignore_case: bool
:return: Character error rate.
:rtype: float
:raises ValueError: If the reference length is zero.
"""
    if ignore_case:
reference = reference.lower()
hypothesis = hypothesis.lower()
reference = ' '.join(filter(None, reference.split(' ')))
hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
if len(reference) == 0:
raise ValueError("Length of reference should be greater than 0.")
edit_distance = _levenshtein_distance(reference, hypothesis)
cer = float(edit_distance) / len(reference)
return cer
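A short worked example of the two metrics above (the sentences are made up): one substitution and one deletion against a four-word reference give WER = (1 + 1 + 0) / 4 = 0.5.

```python
print(wer('the cat sat down', 'the cat sit'))  # 0.5
print(cer('werewolf', 'weae wolf'))            # 0.25 (edit distance 2 over 8 chars)
```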
"""Inferer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import gzip
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import deep_speech2
from decoder import ctc_decode
import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_samples",
default=10,
type=int,
help="Number of samples for inference. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
args = parser.parse_args()
def infer():
"""Max-ctc-decoding for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}',
num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
    # The size 161 * 161 is only a placeholder value and the real shape
    # of the input batch data will be inferred during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path,
batch_size=args.num_samples,
sortagrad=False,
shuffle_method=None)
infer_data = batch_reader().next()
# run inference
infer_results = paddle.infer(
output_layer=output_probs, parameters=parameters, input=infer_data)
num_steps = len(infer_results) // len(infer_data)
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(len(infer_data))
]
# decode and print
for i, probs in enumerate(probs_split):
output_transcription = ctc_decode(
probs_seq=probs,
vocabulary=data_generator.vocab_list,
method="best_path")
target_transcription = ''.join(
[data_generator.vocab_list[index] for index in infer_data[i][1]])
print("Target Transcription: %s \nOutput Transcription: %s \n" %
(target_transcription, output_transcription))
def main():
utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=1)
infer()
if __name__ == '__main__':
main()
"""Contains DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act):
"""
Convolution layer with batch normalization.
"""
conv_layer = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=num_channels_in,
num_filters=num_channels_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
"""
    Bidirectional simple rnn layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
"""
    # input-hidden weights are shared across the bi-directional rnn.
input_proj = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn = paddle.layer.batch_norm(
input=input_proj, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=True)
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
def conv_group(input, num_stacks):
"""
Convolution group with several stacking convolution layers.
"""
conv = conv_bn_layer(
input=input,
filter_size=(11, 41),
num_channels_in=1,
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
filter_size=(11, 21),
num_channels_in=32,
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
def rnn_group(input, size, num_stacks):
"""
RNN group with several stacking RNN layers.
"""
output = input
for i in xrange(num_stacks):
output = bidirectional_simple_rnn_bn_layer(
name=str(i), input=output, size=size, act=paddle.activation.BRelu())
return output
def deep_speech2(audio_data,
text_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256,
is_inference=False):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:param is_inference: False in the training mode, and True in the
                         inference mode.
:type is_inference: bool
    :return: If is_inference is False, return a ctc cost layer;
             if is_inference is True, return a sequence layer of the output
             probability distribution.
    :rtype: LayerOutput
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
    # convert data from the convolution feature map to a sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
num_channels=conv_group_num_channels,
stride_x=1,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
act=paddle.activation.Linear(),
bias_attr=True)
if is_inference:
# probability distribution with softmax
return paddle.layer.mixed(
input=paddle.layer.identity_projection(input=fc),
act=paddle.activation.Softmax())
else:
# ctc cost
return paddle.layer.warp_ctc(
input=fc,
label=text_data,
size=dict_size + 1,
blank=dict_size,
norm_by_times=True)
wget==3.2
scipy==0.13.1
resampy==0.1.5
#!/bin/bash
# install python dependencies
if [ -f "requirements.txt" ]; then
pip install -r requirements.txt
fi
if [ $? != 0 ]; then
echo "Install python dependencies failed !!!"
exit 1
fi
# install package Soundfile
curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
if [ $? != 0 ]; then
echo "Download libsndfile-1.0.28.tar.gz failed !!!"
exit 1
fi
tar -zxvf libsndfile-1.0.28.tar.gz
cd libsndfile-1.0.28
./configure && make && make install
cd -
rm -rf libsndfile-1.0.28
rm libsndfile-1.0.28.tar.gz
pip install SoundFile==0.9.0.post1
if [ $? != 0 ]; then
echo "Install SoundFile failed !!!"
exit 1
fi
# prepare ./checkpoints
mkdir checkpoints
echo "Install all dependencies successfully."
# -*- coding: utf-8 -*-
"""Test error rate."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import error_rate
class TestParse(unittest.TestCase):
def test_wer_1(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6)
def test_wer_2(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
word_error_rate = error_rate.wer(ref, ref)
self.assertEqual(word_error_rate, 0.0)
def test_wer_3(self):
ref = ' '
hyp = 'Hypothesis sentence'
with self.assertRaises(ValueError):
word_error_rate = error_rate.wer(ref, hyp)
def test_cer_1(self):
ref = 'werewolf'
hyp = 'weae wolf'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
def test_cer_2(self):
ref = 'werewolf'
char_error_rate = error_rate.cer(ref, ref)
self.assertEqual(char_error_rate, 0.0)
def test_cer_3(self):
ref = u'我是中国人'
hyp = u'我是 美洲人'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)
def test_cer_4(self):
ref = u'我是中国人'
char_error_rate = error_rate.cer(ref, ref)
        self.assertEqual(char_error_rate, 0.0)
def test_cer_5(self):
ref = ''
hyp = 'Hypothesis'
with self.assertRaises(ValueError):
char_error_rate = error_rate.cer(ref, hyp)
if __name__ == '__main__':
unittest.main()
"""Trainer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import argparse
import gzip
import time
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from model import deep_speech2
from data_utils.data import DataGenerator
import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size", default=256, type=int, help="Minibatch size.")
parser.add_argument(
"--num_passes",
default=200,
type=int,
help="Training pass number. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--adam_learning_rate",
default=5e-4,
type=float,
help="Learning rate for ADAM Optimizer. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--use_sortagrad",
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--train_manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for training. (default: %(default)s)")
parser.add_argument(
"--dev_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for validation. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--init_model_path",
default=None,
type=str,
help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='[{"type": "shift", '
'"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
'"prob": 1.0}]',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
args = parser.parse_args()
def train():
"""DeepSpeech2 training."""
# initialize data generator
def data_generator():
return DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config,
max_duration=args.max_duration,
min_duration=args.min_duration,
num_threads=args.num_threads_data)
train_generator = data_generator()
test_generator = data_generator()
# create network config
# paddle.data_type.dense_array is used for variable batch input.
    # The size 161 * 161 is only a placeholder value and the real shape
    # of the input batch data will be inferred during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(
train_generator.vocab_size))
cost = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=train_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=False)
# create/load parameters and optimizer
if args.init_model_path is None:
parameters = paddle.parameters.create(cost)
else:
if not os.path.isfile(args.init_model_path):
raise IOError("Invalid model!")
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# prepare data reader
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path,
batch_size=args.batch_size,
min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method)
test_batch_reader = test_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path,
batch_size=args.batch_size,
min_batch_size=1, # must be 1, but will have errors.
sortagrad=False,
shuffle_method=None)
# create event handler
def event_handler(event):
global start_time, cost_sum, cost_counter
if isinstance(event, paddle.event.EndIteration):
cost_sum += event.cost
cost_counter += 1
if (event.batch_id + 1) % 100 == 0:
print("\nPass: %d, Batch: %d, TrainCost: %f" % (
event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
cost_sum, cost_counter = 0.0, 0
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=args.num_passes,
feeding=train_generator.feeding)
def main():
utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train()
if __name__ == '__main__':
main()
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----- Configuration Arguments -----")
for arg, value in vars(args).iteritems():
print("%s: %s" % (arg, value))
print("------------------------------------")
# Generating Text with a Recurrent Neural Network Language Model
A language model is a probability distribution model: put simply, a model that computes the probability of a sentence. With it we can decide which word sequence is more likely or, given several words, predict the most probable next word. The language model is an important foundational model in natural language processing.
## Application Scenarios
**Language models are used in many areas**, for example:
* **Automatic writing**: a language model can generate the next word from the preceding text; applied recursively, it can generate whole sentences, paragraphs and articles.
* **QA**: a language model can generate an Answer from a Question.
* **Machine translation**: most mainstream machine translation models follow the Encoder-Decoder paradigm, in which the Decoder is essentially a conditional language model used to generate the target language.
* **Spell checking**: a language model can compute the probability of a word sequence; the probability usually drops sharply at a misspelling, which can be used to detect spelling errors and propose correction candidates.
* **Part-of-speech tagging, syntactic parsing, speech recognition ...**
## About This Example
This example implements an RNN-based language model and uses it to generate text. The directory layout is as follows:
```text
.
├── data
│   └── train_data_examples.txt        # sample data; follow its format to provide your own data
├── config.py    # configuration file, covering data, train and infer settings
├── generate.py  # inference script, i.e. text generation
├── beam_search.py    # implementation of the beam search algorithm
├── network_conf.py   # all network structures used in this example are defined here; modify this file to change the model structure
├── reader.py    # data reading interface
├── README.md
├── train.py     # training script
└── utils.py     # common utilities, e.g. building and loading the vocabulary
```
## RNN Language Model
### Introduction
An RNN is a sequence model. The basic idea is: at time $t$, the hidden-layer output of the previous time step $t-1$ and the word vector of time $t$ are fed into the hidden layer together to obtain the feature representation at time $t$, which is then used to make the prediction for time $t$, and this recursion continues along the time dimension. RNNs are therefore good at exploiting context and historical knowledge, i.e. they have a "memory". In theory an RNN can model long-range dependencies (using knowledge from long ago), but in practice the results are often unsatisfactory, which motivated variants such as LSTM and GRU: they introduce gating mechanisms that improve the memory cell of the vanilla RNN and ease the difficulties of learning long sequences. This example uses LSTM or GRU, selectable via the configuration. The figure below illustrates the "recurrent" idea of an RNN (in the broad sense, including LSTM, GRU, etc.) language model:
<p align=center><img src='images/rnn.png' width='500px'/></p>
### Model Implementation
The RNN language model in this example is implemented as follows:
- **Define model parameters**: `config.py` defines the model's parameter variables.
- **Define the model structure**: the **structure** of the model is defined in the `rnn_lm` **function** in `network_conf.py`, as follows:
    - Input layer: maps the input word (or character) sequence to vectors, i.e. the word embedding layer `embedding`.
    - Hidden layers: RNN layers built according to the configuration, taking the `embedding` vector sequence produced in the previous step as input.
    - Output layer: uses `softmax` to normalize and compute the word probabilities.
    - loss: multi-class cross-entropy is used as the loss function of the model.
- **Train the model**: the `main` function in `train.py` implements training, with the following flow:
    - Prepare the input data: build and save the vocabulary, and construct the readers for the train and test data.
    - Initialize the model: including the model structure and parameters.
    - Build the trainer: the demo uses the Adam optimization algorithm.
    - Define the callback: build `event_handler` to track the training loss and save the model parameters at the end of every pass.
    - Train: train the model with the trainer.
- **Generate text**: `generate.py` implements text generation, with the following flow:
    - Load the trained model and the vocabulary file.
    - Read the `gen_file` file, where each line is a sentence prefix, and generate text for each prefix with [beam search](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md#柱搜索算法).
    - Save the generated text together with its prefix to the file `gen_result`.
## Usage
Run this example as follows:
* 1. Run `python train.py` to train the model (an RNN by default) and wait for training to finish.
* 2. Run `python generate.py` to generate text. (The input text defaults to `data/train_data_examples.txt`, and the generated text is saved to `data/gen_result.txt` by default.)
**To use your own corpus or customize the model, modify the configuration in `config.py`; the details and adaptation steps are as follows:**
### Corpus Adaptation
* Clean the corpus: remove spaces, tabs and garbled characters from the raw text, and remove digits, punctuation and special symbols as needed.
* Content format: one sentence per line; words within a line are separated by a single space.
* Configure the following parameters in `config.py` as needed:
```python
train_file = "data/train_data_examples.txt"
test_file = ""
vocab_file = "data/word_vocab.txt"
model_save_dir = "models"
```
1. `train_file`: the path of the training data, **which must be pre-tokenized**.
2. `test_file`: the path of the test data; if it is not empty, the specified test data is evaluated at the end of every training `pass`.
3. `vocab_file`: the path of the vocabulary; if the file does not exist, word frequencies are counted over the training corpus to build the vocabulary.
4. `model_save_dir`: the directory where models are saved; it is created automatically if it does not exist.
### Vocabulary Building Strategy
- When the specified vocabulary file does not exist, word frequencies are counted over the training data and the vocabulary is built automatically. The following two parameters in `config.py` control this process:
```python
max_word_num = 51200 - 2
cutoff_word_fre = 0
```
    1. `max_word_num`: the maximum number of words in the vocabulary.
    2. `cutoff_word_fre`: the minimum frequency a word must have in the training corpus to enter the vocabulary.
- For example, if `max_word_num = 5000` and `cutoff_word_fre = 10`, but the frequency count finds only 3000 words occurring more than 10 times in the training corpus, the final vocabulary will contain those 3000 words.
- Two special tokens are added automatically when the vocabulary is built:
    1. `<unk>`: words that do not appear in the vocabulary
    2. `<e>`: the end-of-sentence marker
*Note: the larger the vocabulary, the richer the generated text, but also the longer the training time. After Chinese word segmentation, a corpus can contain tens or even hundreds of thousands of distinct words; if `max_word_num` is too small, the share of `<unk>` becomes too high, while if it is too large, training slows down severely (and accuracy also suffers). An alternative is character-level training, i.e. treating each Chinese character as a word: with only a few thousand common characters the vocabulary stays small and little information is lost, but the same character can carry very different meanings in different words, which sometimes hurts the model. Try both and choose word-level or character-level training according to your data.* A simplified sketch of the frequency cut-off is given after this note.
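The following is a simplified sketch of the frequency cut-off described above (not this example's actual code; the full version is `build_dict` in `utils.py`):

```python
from collections import Counter

def build_vocab_sketch(lines, max_word_num, cutoff_word_fre):
    # Count word frequencies, keep at most max_word_num words, drop words whose
    # frequency is below cutoff_word_fre, and prepend the special tokens.
    counts = Counter(w for line in lines for w in line.strip().lower().split())
    kept = [w for w, c in counts.most_common(max_word_num) if c >= cutoff_word_fre]
    return ["<unk>", "<e>"] + kept
```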
### Model Adaptation and Training
* Adjust the following settings in `config.py` as needed to modify the network structure of the RNN language model:
```python
rnn_type = "lstm" # "gru" or "lstm"
emb_dim = 256
hidden_size = 256
stacked_rnn_num = 2
```
1. `rnn_type`: either "gru" or "lstm", choosing which RNN unit to use.
2. `emb_dim`: the dimension of the word embeddings.
3. `hidden_size`: the hidden size of the RNN units.
4. `stacked_rnn_num`: the number of stacked RNN units, forming a deeper model.
* Run `python train.py` to train the model; models are saved to the directory specified by `model_save_dir`.
### Generating Text on Demand
* Adjust the following variables in `config.py` as needed:
```python
gen_file = "data/train_data_examples.txt"
gen_result = "data/gen_result.txt"
max_gen_len = 25 # the max number of words to generate
beam_size = 5
model_path = "models/rnn_lm_pass_00000.tar.gz"
```
1. `gen_file`: the input data file, one sentence prefix per line, **which must be pre-tokenized**.
2. `gen_result`: the output file path; generation results are written to this file.
3. `max_gen_len`: the maximum length of each generated sentence; if the model does not produce `<e>`, generation stops automatically after `max_gen_len` words.
4. `beam_size`: the expansion width of each beam search step.
5. `model_path`: the path of the trained model.
`gen_file` stores the text prefixes to generate from, one prefix per line, in the following form:
```text
若隐若现 地像 幽灵 , 像 死神
```
Save the prefixes you want to complete into the file in this format;
* run `python generate.py` to generate text for the input prefixes with the beam search algorithm. Below is sample output from the model:
```text
81 若隐若现 地像 幽灵 , 像 死神
-12.2542 一样 。 他 是 个 怪物 <e>
-12.6889 一样 。 他 是 个 英雄 <e>
-13.9877 一样 。 他 是 我 的 敌人 <e>
-14.2741 一样 。 他 是 我 的 <e>
-14.6250 一样 。 他 是 我 的 朋友 <e>
```
In the output:
1. The first line, `81 若隐若现 地像 幽灵 , 像 死神`, is split by `\t` into two columns:
    - the first column is the index of the input prefix in the training sample set;
    - the second column is the input prefix.
2. Lines 2 through `beam_size + 1` are the generation results, likewise split by `\t` into two columns:
    - the first column is the log probability of the generated sequence;
    - the second column is the generated text; a normal result ends with the token `<e>`, and a result that does not end with `<e>` means the maximum sequence length was reached and generation was forcibly terminated.
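For convenience, a hypothetical helper (not part of this example's code) that parses `gen_result` back into Python objects, following the format described above:

```python
def read_gen_result(path):
    # Each group: "<index>\t<prefix>" followed by beam_size lines of
    # "<log probability>\t<generated text>", terminated by a blank line.
    results, prefix, beams = [], None, []
    for line in open(path):
        line = line.rstrip("\n")
        if not line:
            if prefix is not None:
                results.append((prefix, beams))
            prefix, beams = None, []
        elif prefix is None:
            prefix = line.split("\t", 1)[1]
        else:
            log_prob, text = line.split("\t", 1)
            beams.append((float(log_prob), text))
    return results
```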
#!/usr/bin/env python
# coding=utf-8
import os
import math
import numpy as np
import paddle.v2 as paddle
from utils import logger, load_reverse_dict
__all__ = ["BeamSearch"]
class BeamSearch(object):
"""
Generating sequence by beam search
NOTE: this class only implements generating one sentence at a time.
"""
def __init__(self, inferer, word_dict_file, beam_size=1, max_gen_len=100):
"""
constructor method.
:param inferer: object of paddle.Inference that represents the entire
network to forward compute the test batch
:type inferer: paddle.Inference
:param word_dict_file: path of word dictionary file
:type word_dict_file: str
:param beam_size: expansion width in each iteration
        :type beam_size: int
:param max_gen_len: the maximum number of iterations
:type max_gen_len: int
"""
self.inferer = inferer
self.beam_size = beam_size
self.max_gen_len = max_gen_len
self.ids_2_word = load_reverse_dict(word_dict_file)
logger.info("dictionay len = %d" % (len(self.ids_2_word)))
try:
self.eos_id = next(x[0] for x in self.ids_2_word.iteritems()
if x[1] == "<e>")
self.unk_id = next(x[0] for x in self.ids_2_word.iteritems()
if x[1] == "<unk>")
except StopIteration:
logger.fatal(("the word dictionay must contain an ending mark "
"in the text generation task."))
self.candidate_paths = []
self.final_paths = []
def _top_k(self, softmax_out, k):
"""
        get indices of the words with the k highest probabilities.
        NOTE: <unk> will be excluded if it is among the top k words, in which
        case the word with the (k + 1)th highest probability is returned instead.
        :param softmax_out: probability over the dictionary
        :type softmax_out: ndarray
        :param k: number of word indices to return
        :type k: int
        :return: indices of the k words with the highest probabilities.
:rtype: list
"""
ids = softmax_out.argsort()[::-1]
return ids[ids != self.unk_id][:k]
def _forward_batch(self, batch):
"""
forward a test batch.
        :param batch: the input data batch
        :type batch: list
        :return: probabilities of the predicted words
:rtype: ndarray
"""
return self.inferer.infer(input=batch, field=["value"])
def _beam_expand(self, next_word_prob):
"""
In every iteration step, the model predicts the possible next words.
        For each input sentence, the top k words are appended to the end of the
        original sentence to form new candidate sentences.
        :param next_word_prob: probabilities of the next words
        :type next_word_prob: ndarray
:return: the expanded new sentences.
:rtype: list
"""
assert len(next_word_prob) == len(self.candidate_paths), (
"Wrong forward computing results!")
top_beam_words = np.apply_along_axis(self._top_k, 1, next_word_prob,
self.beam_size)
new_paths = []
for i, words in enumerate(top_beam_words):
old_path = self.candidate_paths[i]
for w in words:
log_prob = old_path["log_prob"] + math.log(next_word_prob[i][w])
gen_ids = old_path["ids"] + [w]
if w == self.eos_id:
self.final_paths.append({
"log_prob": log_prob,
"ids": gen_ids
})
else:
new_paths.append({"log_prob": log_prob, "ids": gen_ids})
return new_paths
def _beam_shrink(self, new_paths):
"""
        Return the top beam_size generated sequences with the highest
        probabilities at the end of every generation iteration.
:param new_paths: all possible generated sentences
:type new_paths: list
:return: a state flag to indicate whether to stop beam search
:rtype: bool
"""
if len(self.final_paths) >= self.beam_size:
max_candidate_log_prob = max(
new_paths, key=lambda x: x["log_prob"])["log_prob"]
min_complete_path_log_prob = min(
self.final_paths, key=lambda x: x["log_prob"])["log_prob"]
if min_complete_path_log_prob >= max_candidate_log_prob:
return True
new_paths.sort(key=lambda x: x["log_prob"], reverse=True)
self.candidate_paths = new_paths[:self.beam_size]
return False
def gen_a_sentence(self, input_sentence):
"""
        Generate a sequence for a given input.
:param input_sentence: one input_sentence
:type input_sentence: list
:return: the generated word sequences
:rtype: list
"""
self.candidate_paths = [{"log_prob": 0., "ids": input_sentence}]
input_len = len(input_sentence)
for i in range(self.max_gen_len):
next_word_prob = self._forward_batch(
[[x["ids"]] for x in self.candidate_paths])
new_paths = self._beam_expand(next_word_prob)
min_candidate_log_prob = min(
new_paths, key=lambda x: x["log_prob"])["log_prob"]
path_to_remove = [
path for path in self.final_paths
if path["log_prob"] < min_candidate_log_prob
]
for p in path_to_remove:
self.final_paths.remove(p)
if self._beam_shrink(new_paths):
self.candidate_paths = []
break
gen_ids = sorted(
self.final_paths + self.candidate_paths,
key=lambda x: x["log_prob"],
reverse=True)[:self.beam_size]
self.final_paths = []
def _to_str(x):
text = " ".join(self.ids_2_word[idx]
for idx in x["ids"][input_len:])
return "%.4f\t%s" % (x["log_prob"], text)
return map(_to_str, gen_ids)
#!/usr/bin/env python
# coding=utf-8
import os
################## for building word dictionary ##################
max_word_num = 51200 - 2
cutoff_word_fre = 0
################## for training task #########################
# path of training data
train_file = "data/train_data_examples.txt"
# path of testing data, if testing file does not exist,
# testing will not be performed at the end of each training pass
test_file = ""
# path of word dictionary, if this file does not exist,
# word dictionary will be built from training data.
vocab_file = "data/word_vocab.txt"
# directory to save the trained model
# create a new directory if the directory does not exist
model_save_dir = "models"
batch_size = 32 # the number of training examples in one forward/backward pass
num_passes = 20 # how many passes to train the model
log_period = 50
save_period_by_batches = 50
use_gpu = True # to use gpu or not
trainer_count = 1 # number of trainer
################## for model configuration ##################
rnn_type = "lstm" # "gru" or "lstm"
emb_dim = 256
hidden_size = 256
stacked_rnn_num = 2
################## for text generation ##################
gen_file = "data/train_data_examples.txt"
gen_result = "data/gen_result.txt"
max_gen_len = 25 # the max number of words to generate
beam_size = 5
model_path = "models/rnn_lm_pass_00000.tar.gz"
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
我们 不会 伤害 你 的 。 他们 也 这么 说 。
你 拥有 你 父亲 皇室 的 血统 。 是 合法 的 继承人 。
叫 什么 你 可以 告诉 我 。
你 并 没有 留言 说 要 去 哪里 。 是 的 , 因为 我 必须 要 去 完成 这件 事 。
你 查出 是 谁 住 在 隔壁 房间 吗 ?
# coding=utf-8
import os
import sys
import gzip
import numpy as np
import paddle.v2 as paddle
from utils import logger, load_dict
from beam_search import BeamSearch
import config as conf
from network_conf import rnn_lm
def rnn_generate(gen_input_file, model_path, max_gen_len, beam_size,
word_dict_file):
"""
use RNN model to generate sequences.
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of "{word, id}",
"word" is string type , "id" is int type.
:param num_words: the number of the words to generate.
:type num_words: int
:param beam_size: beam width.
:type beam_size: int
:return: save prediction results to output_file
"""
assert os.path.exists(gen_input_file), "test file does not exist!"
assert os.path.exists(model_path), "trained model does not exist!"
assert os.path.exists(
word_dict_file), "word dictionary file does not exist!"
# load word dictionary
word_2_ids = load_dict(word_dict_file)
try:
UNK_ID = word_2_ids["<unk>"]
except KeyError:
logger.fatal("the word dictionary must contain a <unk> token!")
sys.exit(-1)
# initialize paddle
paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
# load the trained model
pred_words = rnn_lm(
len(word_2_ids),
conf.emb_dim,
conf.hidden_size,
conf.stacked_rnn_num,
conf.rnn_type,
is_infer=True)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
inferer = paddle.inference.Inference(
output_layer=pred_words, parameters=parameters)
generator = BeamSearch(inferer, word_dict_file, beam_size, max_gen_len)
# generate text
with open(conf.gen_file, "r") as fin, open(conf.gen_result, "w") as fout:
for idx, line in enumerate(fin):
fout.write("%d\t%s" % (idx, line))
for gen_res in generator.gen_a_sentence([
word_2_ids.get(w, UNK_ID)
for w in line.lower().strip().split()
]):
fout.write("%s\n" % gen_res)
fout.write("\n")
if __name__ == "__main__":
rnn_generate(conf.gen_file, conf.model_path, conf.max_gen_len,
conf.beam_size, conf.vocab_file)
# coding=utf-8
import paddle.v2 as paddle
def rnn_lm(vocab_dim,
emb_dim,
hidden_size,
stacked_rnn_num,
rnn_type="lstm",
is_infer=False):
"""
RNN language model definition.
:param vocab_dim: size of vocabulary.
:type vocab_dim: int
:param emb_dim: dimension of the embedding vector
:type emb_dim: int
:param rnn_type: the type of RNN cell.
    :type rnn_type: str
:param hidden_size: number of hidden unit.
:type hidden_size: int
:param stacked_rnn_num: number of stacked rnn cell.
:type stacked_rnn_num: int
    :return: the cost and output layer when training; the output layer of the
             last generated word when is_infer is True.
    :rtype: tuple of LayerOutput | LayerOutput
"""
# input layers
input = paddle.layer.data(
name="input", type=paddle.data_type.integer_value_sequence(vocab_dim))
if not is_infer:
target = paddle.layer.data(
name="target",
type=paddle.data_type.integer_value_sequence(vocab_dim))
# embedding layer
input_emb = paddle.layer.embedding(input=input, size=emb_dim)
# rnn layer
if rnn_type == "lstm":
for i in range(stacked_rnn_num):
rnn_cell = paddle.networks.simple_lstm(
input=rnn_cell if i else input_emb, size=hidden_size)
elif rnn_type == "gru":
for i in range(stacked_rnn_num):
rnn_cell = paddle.networks.simple_gru(
input=rnn_cell if i else input_emb, size=hidden_size)
else:
raise Exception("rnn_type error!")
# fc(full connected) and output layer
output = paddle.layer.fc(
input=[rnn_cell], size=vocab_dim, act=paddle.activation.Softmax())
if is_infer:
last_word = paddle.layer.last_seq(input=output)
return last_word
else:
cost = paddle.layer.classification_cost(input=output, label=target)
return cost, output
# coding=utf-8
import collections
import os
MIN_LEN = 3
MAX_LEN = 100
def rnn_reader(file_name, word_dict):
"""
    Create a reader for the RNN model; each line of the file is one sample.
    Sentences shorter than MIN_LEN or longer than MAX_LEN words are skipped.
    :param file_name: file name.
    :param word_dict: vocab with content of '{word: id}',
                 'word' is string type, 'id' is int type.
:return: data reader.
"""
def reader():
UNK_ID = word_dict['<unk>']
with open(file_name) as file:
for line in file:
words = line.strip().lower().split()
if len(words) < MIN_LEN or len(words) > MAX_LEN:
continue
ids = [word_dict.get(w, UNK_ID)
for w in words] + [word_dict['<e>']]
yield ids[:-1], ids[1:]
return reader
#!/usr/bin/env python
# coding=utf-8
import os
import sys
import gzip
import paddle.v2 as paddle
import config as conf
import reader
from network_conf import rnn_lm
from utils import logger, build_dict, load_dict
def train(topology,
train_reader,
test_reader,
model_save_dir="models",
num_passes=10):
"""
train model.
:param topology: cost layer of the model to train.
:type topology: LayerOuput
:param train_reader: train data reader.
    :type train_reader: collections.Iterable
:param test_reader: test data reader.
:type test_reader: collections.Iterable
:param model_save_dir: path to save the trained model
:type model_save_dir: str
:param num_passes: number of epoch
:type num_passes: int
"""
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
# initialize PaddlePaddle
paddle.init(
use_gpu=conf.use_gpu, gpu_id=3, trainer_count=conf.trainer_count)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000))
# create parameters
parameters = paddle.parameters.create(topology)
# create trainer
trainer = paddle.trainer.SGD(
cost=topology, parameters=parameters, update_equation=adam_optimizer)
# define the event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if not event.batch_id % conf.log_period:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if (not event.batch_id %
conf.save_period_by_batches) and event.batch_id:
save_name = os.path.join(model_save_dir,
"rnn_lm_pass_%05d_batch_%03d.tar.gz" %
(event.pass_id, event.batch_id))
with gzip.open(save_name, "w") as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
result = trainer.test(reader=test_reader)
logger.info("Test with Pass %d, %s" %
(event.pass_id, result.metrics))
save_name = os.path.join(model_save_dir, "rnn_lm_pass_%05d.tar.gz" %
(event.pass_id))
with gzip.open(save_name, "w") as f:
parameters.to_tar(f)
logger.info("start training...")
trainer.train(
reader=train_reader, event_handler=event_handler, num_passes=num_passes)
logger.info("Training is finished.")
def main():
# prepare vocab
if not (os.path.exists(conf.vocab_file) and
os.path.getsize(conf.vocab_file)):
logger.info(("word dictionary does not exist, "
"build it from the training data"))
build_dict(conf.train_file, conf.vocab_file, conf.max_word_num,
conf.cutoff_word_fre)
logger.info("load word dictionary.")
word_dict = load_dict(conf.vocab_file)
logger.info("dictionay size = %d" % (len(word_dict)))
cost = rnn_lm(
len(word_dict), conf.emb_dim, conf.hidden_size, conf.stacked_rnn_num,
conf.rnn_type)
# define reader
reader_args = {
"file_name": conf.train_file,
"word_dict": word_dict,
}
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.rnn_reader(**reader_args), buf_size=102400),
batch_size=conf.batch_size)
test_reader = None
if os.path.exists(conf.test_file) and os.path.getsize(conf.test_file):
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.rnn_reader(**reader_args), buf_size=65536),
            batch_size=conf.batch_size)
train(
topology=cost,
train_reader=train_reader,
test_reader=test_reader,
model_save_dir=conf.model_save_dir,
num_passes=conf.num_passes)
if __name__ == "__main__":
main()
#!/usr/bin/env python
# coding=utf-8
import os
import logging
from collections import defaultdict
__all__ = ["build_dict", "load_dict"]
logger = logging.getLogger("paddle")
logger.setLevel(logging.DEBUG)
def build_dict(data_file,
save_path,
max_word_num,
cutoff_word_fre=5,
insert_extra_words=["<unk>", "<e>"]):
"""
:param data_file: path of data file
:type data_file: str
:param save_path: path to save the word dictionary
:type save_path: str
    :param max_word_num: if max_word_num is set, at most the top max_word_num
        words will be added into the word vocabulary
    :type max_word_num: int
    :param cutoff_word_fre: words whose frequencies are less than
        cutoff_word_fre will not be added into the word vocabulary.
        NOTE: the final vocabulary size is limited by both max_word_num
        and cutoff_word_fre
    :type cutoff_word_fre: int
    :param insert_extra_words: extra tokens defined by users that are added into
        the word dictionary, usually including <unk> and the ending mark <e>
    :type insert_extra_words: list
"""
word_count = defaultdict(int)
with open(data_file, "r") as f:
for idx, line in enumerate(f):
if not (idx + 1) % 100000:
logger.debug("processing %d lines ... " % (idx + 1))
words = line.strip().lower().split()
for w in words:
word_count[w] += 1
sorted_words = sorted(
word_count.iteritems(), key=lambda x: x[1], reverse=True)
stop_pos = len(sorted_words) if sorted_words[-1][
1] > cutoff_word_fre else next(idx for idx, v in enumerate(sorted_words)
if v[1] < cutoff_word_fre)
stop_pos = min(max_word_num, stop_pos)
with open(save_path, "w") as fdict:
for w in insert_extra_words:
fdict.write("%s\t-1\n" % (w))
for idx, info in enumerate(sorted_words):
if idx == stop_pos: break
fdict.write("%s\t%d\n" % (info[0], info[-1]))
def load_dict(dict_path):
"""
    load word dictionary from the given file. Each line of the given file is
    a word in the word dictionary. The first column of the line, separated by
    TAB, is the key, while the line index is the value.
:param dict_path: path of word dictionary
:type dict_path: str
:return: the dictionary
:rtype: dict
"""
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
"""
    load word dictionary from the given file. Each line of the given file is
    a word in the word dictionary. The line index is the key, while the first
    column of the line, separated by TAB, is the value.
:param dict_path: path of word dictionary
:type dict_path: str
:return: the dictionary
:rtype: dict
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
# Accelerating Word Embedding Training with Hsigmoid
## Background
In natural language processing, the traditional approach represents words as one-hot vectors. For instance, with the vocabulary ['我', '你', '喜欢'], the vectors [1,0,0], [0,1,0] and [0,0,1] represent '我', '你' and '喜欢' respectively. This representation is simple, but when the vocabulary is large it suffers from dimensionality explosion, and since any two word vectors are orthogonal, each vector carries little information. To avoid or mitigate these drawbacks, word embeddings are now commonly used instead: each word is represented by a low-dimensional dense real vector rather than a high-dimensional sparse one-hot vector. There are many ways to train word embeddings, neural network models such as CBOW and Skip-gram among them. These models are essentially classifiers, and when the vocabulary (i.e. the number of classes) is large, the conventional softmax becomes very time-consuming. PaddlePaddle provides the Hsigmoid Layer and the NCE Layer to speed up training. This article focuses on how to use the Hsigmoid Layer; for word embeddings in general, see the [word2vec chapter](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec) of the PaddlePaddle Book.
## Hsigmoid Layer
The Hsigmoid Layer comes from paper \[[1](#references)\]. Hsigmoid stands for Hierarchical-sigmoid; the idea is to reduce the computational complexity by building a classification binary tree in which every leaf node represents a class and every internal node is a binary classifier. For example, with four classes 0, 1, 2 and 3, softmax computes a score for each of the four classes and then normalizes them into probabilities; when there are many classes, computing the probability of every class is very expensive. The Hsigmoid Layer instead builds a balanced binary tree over the classes, as shown below:
<p align="center">
<img src="images/binary_tree.png" width="220" hspace='10'/> <img src="images/path_to_1.png" width="220" hspace='10'/> <br/>
Figure 1. (a) the balanced binary tree; (b) the path from the root to class 1
</p>
Every internal node of the binary tree is a binary (sigmoid) classifier: if the decision is 0 we descend into the left child and continue classifying, otherwise into the right child, until a leaf is reached. In this way every class corresponds to one path; for example, the path from the root to class 1 is encoded as 0, 1. During training we follow the path of the ground-truth class, compute the loss of each classifier on that path, and combine these losses into the final loss. At prediction time the model outputs the probabilities of all internal-node classifiers; from these probabilities we obtain the path code, and traversing the path gives the final predicted class. The computational complexity of conventional softmax is N (N being the vocabulary size), while Hsigmoid reduces it to log(N); see paper \[[1](#references)\] for the theoretical details.
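To make the path encoding concrete, here is a small illustrative sketch (not PaddlePaddle's internal implementation) that uses the same heap-style numbering as the `decode_res` helper shown later in this article: the root is node 1, going left appends bit 0, going right appends bit 1, and class `c` sits at leaf node `c + num_classes`.

```python
def path_bits(label, num_classes):
    # Binary digits of the leaf's heap index, without the leading 1 of the root.
    node = label + num_classes
    return [int(b) for b in bin(node)[3:]]

print(path_bits(1, 4))  # [0, 1] -- the path to class 1 in Figure 1(b)
```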
## Data Preparation
### PTB Data
This article uses the Penn Treebank (PTB) dataset ([Tomas Mikolov's preprocessed version](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)), which contains three files: train, valid and test. We use train as the training data and valid as the test data. We train a 5-gram model, i.e. the first 4 words of each sample are used to predict the 5th. PaddlePaddle provides the python package [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) for the PTB dataset, which downloads and preprocesses the data automatically. Preprocessing adds the start marker \<s> and the end marker \<e> to both ends of every sentence, then slides a window of the configured size (5 here) from left to right, producing one sample per position. For example, "I have a dream that one day" yields \<s> I have a dream, I have a dream that, have a dream that one, a dream that one day, and dream that one day \<e>. PaddlePaddle converts the words into id data as the output of preprocessing.
### Custom Data
You can train the model on your own dataset. The key step is implementing a reader interface for data processing: the reader must produce an iterator that parses each line of the file and returns a python list, e.g. [1, 2, 3, 4, 5], the ids of the first through fifth words in the vocabulary. PaddlePaddle further converts this list into `paddle.data_type.integer_value` as the input of the data layer. A wrapper example is shown below, followed by a small usage sketch:
```python
def reader_creator(filename, word_dict, n):
def reader():
with open(filename) as f:
UNK = word_dict['<unk>']
for l in f:
l = ['<s>'] + l.strip().split() + ['<e>']
if len(l) >= n:
l = [word_dict.get(w, UNK) for w in l]
for i in range(n, len(l) + 1):
yield tuple(l[i - n:i])
return reader
def train_data(filename, word_dict, n):
"""
Reader interface for training data.
It returns a reader creator, each sample in the reader is a word ID tuple.
:param filename: path of data file
:type filename: str
:param word_dict: word dictionary
:type word_dict: dict
:param n: sliding window size
:type n: int
"""
return reader_creator(filename, word_dict, n)
```
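A hypothetical usage of the reader above (the file path and the tiny `word_dict` are placeholders): with n=5, a corpus line "i have a dream" becomes `<s> i have a dream <e>` and yields two 5-word windows of ids.

```python
word_dict = {'<s>': 0, '<e>': 1, '<unk>': 2, 'i': 3, 'have': 4, 'a': 5, 'dream': 6}
for sample in train_data("my_corpus.txt", word_dict, 5)():
    print(sample)  # e.g. (0, 3, 4, 5, 6) then (3, 4, 5, 6, 1)
```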
## Network Structure
This article obtains word embeddings by training an N-gram language model; concretely, the preceding 4 words are used to predict the current word. The network input is the word ids in the vocabulary; the word embeddings are looked up from the embedding table, the 4 embeddings are concatenated, fed into a fully connected hidden layer, and finally into the `Hsigmoid` layer. The detailed network structure is shown in Figure 2:
<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
Figure 2. Network configuration
</p>
The code is as follows:
```python
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
if is_train == True:
cost = paddle.layer.hsigmoid(
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
```
Note that at prediction time we need to transpose the hsigmoid parameters once; the number of output classes here is the vocabulary size minus 1, i.e. the number of internal (non-leaf) nodes.
## Training
Training is straightforward: just run ``` python train.py ```. On the first run the program checks whether the imikolov dataset is present in the user's cache directory and downloads it automatically if not. During training, model information, mainly the training loss and the test loss, is printed every 100 iterations, and the model is saved once per pass.
## Prediction
For prediction, run ``` python infer.py ```. The program first loads the model, then predicts batch by batch and prints the results. The most important step of prediction is obtaining the path code from the probabilities and traversing the path to get the final predicted class. This logic is shown below:
```python
def decode_res(infer_res, dict_size):
"""
Inferring probabilities are orginized as a complete binary tree.
The actual labels are leaves (indices are counted from class number).
This function travels paths decoded from inferring results.
If the probability >0.5 then go to right child, otherwise go to left child.
param infer_res: inferring result
param dict_size: class number
return predict_lbls: actual class
"""
predict_lbls = []
infer_res = infer_res > 0.5
for i, probs in enumerate(infer_res):
idx = 0
result = 1
while idx < len(probs):
result <<= 1
if probs[idx]:
result |= 1
if probs[idx]:
idx = idx * 2 + 2 # right child
else:
idx = idx * 2 + 1 # left child
predict_lbl = result - dict_size
predict_lbls.append(predict_lbl)
return predict_lbls
```
The input data format of the prediction program is the same as in training, e.g. "have a dream that one": the program produces a set of probabilities from "have a dream that", decodes them into the predicted word, and uses "one" as the ground truth for evaluation. The decoding function takes the predicted probabilities of one batch of samples and the vocabulary size; the loop inside decodes the output probabilities of each sample by repeatedly following the left-0 / right-1 rule along the path until a leaf node is reached (a toy trace is given below). Note that the dataset used here needs a fairly long training time to give good results; the prediction program uses the model of the first pass only for convenience of demonstration, so the quality of the result is not guaranteed.
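A toy trace of the decoding rule (made-up probabilities, `dict_size = 4`, hence `dict_size - 1 = 3` internal-node outputs per sample): starting at the root, 0.9 > 0.5 sends us to the right child (bit 1, next node index 2); there 0.8 > 0.5 again goes right (bit 1) and reaches a leaf; the path bits 1, 1 correspond to heap node 0b111 = 7, and 7 - dict_size gives class 3.

```python
import numpy as np
print(decode_res(np.array([[0.9, 0.2, 0.8]]), 4))  # [3]
```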
## 参考文献
1. Morin, F., & Bengio, Y. (2005, January). [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf). In Aistats (Vol. 5, pp. 246-252).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def decode_result(infer_res, dict_size):
"""
    Inference probabilities are organized as a complete binary tree.
    The actual labels are the leaves (indices are counted from the class number).
    This function traverses the path decoded from the inference results:
    if the probability is > 0.5, go to the right child; otherwise, go to the left child.
param infer_res: inferring result
param dict_size: class number
return predict_lbls: actual class
"""
predict_lbls = []
infer_res = infer_res > 0.5
for i, probs in enumerate(infer_res):
idx = 0
result = 1
while idx < len(probs):
result <<= 1
if probs[idx]:
result |= 1
if probs[idx]:
idx = idx * 2 + 2 # right child
else:
idx = idx * 2 + 1 # left child
predict_lbl = result - dict_size
predict_lbls.append(predict_lbl)
return predict_lbls
def infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer):
infer_res = inferer.infer(input=batch_ins)
predict_lbls = decode_result(infer_res, dict_size)
predict_words = [idx_word_dict[lbl] for lbl in predict_lbls] # map to word
    # Output format: word1 word2 word3 word4 -> predict label
for i, ins in enumerate(batch_ins):
print(" ".join([idx_word_dict[w]
for w in ins]) + " -> " + predict_words[i])
def infer(model_path, batch_size):
assert os.path.exists(model_path), "trained model does not exist."
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
prediction_layer = ngram_lm(
is_train=False, hidden_size=256, embed_size=32, dict_size=dict_size)
with gzip.open(model_path, "r") as f:
parameters = paddle.parameters.Parameters.from_tar(f)
inferer = paddle.inference.Inference(
output_layer=prediction_layer, parameters=parameters)
idx_word_dict = dict((v, k) for k, v in word_dict.items())
batch_ins = []
for ins in paddle.dataset.imikolov.test(word_dict, 5)():
batch_ins.append(ins[:-1])
if len(batch_ins) == batch_size:
infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer)
batch_ins = []
if len(batch_ins) > 0:
infer_a_batch(batch_ins, idx_word_dict, dict_size, inferer)
if __name__ == "__main__":
infer("models/hsigmoid_batch_00010.tar.gz", 20)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
import paddle.v2 as paddle
def ngram_lm(hidden_size, embed_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=embed_size, param_attr=embed_param_attr))
target_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
embed_context = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=embed_context,
size=hidden_size,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(embed_size * 8), learning_rate=1))
    if is_train:
cost = paddle.layer.hsigmoid(
input=hidden_layer,
label=target_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="sigmoid_w"),
bias_attr=paddle.attr.Param(name="sigmoid_b"))
return cost
else:
prediction = paddle.layer.fc(
size=dict_size - 1,
input=hidden_layer,
act=paddle.activation.Sigmoid(),
bias_attr=paddle.attr.Param(name="sigmoid_b"),
param_attr=paddle.attr.Param(name="sigmoid_w"))
return prediction
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def main(save_dir="models"):
if not os.path.exists(save_dir):
os.mkdir(save_dir)
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict(min_word_freq=2)
dict_size = len(word_dict)
cost = ngram_lm(hidden_size=256, embed_size=32, dict_size=dict_size)
def event_handler(event):
if isinstance(event, paddle.event.EndPass):
model_name = os.path.join(save_dir, "hsigmoid_pass_%05d.tar.gz" %
event.pass_id)
logger.info("Save model into %s ..." % model_name)
with gzip.open(model_name, "w") as f:
parameters.to_tar(f)
if isinstance(event, paddle.event.EndIteration):
if event.batch_id and event.batch_id % 10 == 0:
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, 5), 32))
logger.info(
"Pass %d, Batch %d, Cost %f, Test Cost %f" %
(event.pass_id, event.batch_id, event.cost, result.cost))
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
trainer.train(
paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imikolov.train(word_dict, 5)(),
buf_size=1000), 64),
num_passes=30,
event_handler=event_handler)
if __name__ == "__main__":
main()
图像分类
=======================
这里将介绍如何在PaddlePaddle下使用AlexNet、VGG、GoogLeNet和ResNet模型进行图像分类。图像分类问题的描述和这四种模型的介绍可以参考[PaddlePaddle book](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)
## 训练模型
### 初始化
在初始化阶段需要导入所用的包,并对PaddlePaddle进行初始化。
```python
import gzip
import paddle.v2.dataset.flowers as flowers
import paddle.v2 as paddle
import reader
import vgg
import resnet
import alexnet
import googlenet
# PaddlePaddle init
paddle.init(use_gpu=False, trainer_count=1)
```
### 定义参数和输入
设置算法参数(如数据维度、类别数目和batch size等参数),定义数据输入层`image`和类别标签`lbl`
```python
DATA_DIM = 3 * 224 * 224
CLASS_DIM = 102
BATCH_SIZE = 128
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
```
### 获得所用模型
这里可以选择使用AlexNet、VGG、GoogLeNet和ResNet模型中的一个模型进行图像分类。通过调用相应的方法可以获得网络最后的Softmax层。
1. 使用AlexNet模型
指定输入层`image`和类别数目`CLASS_DIM`后,可以通过下面的代码得到AlexNet的Softmax层。
```python
out = alexnet.alexnet(image, class_dim=CLASS_DIM)
```
2. 使用VGG模型
根据层数的不同,VGG分为VGG13、VGG16和VGG19。使用VGG16模型的代码如下:
```python
out = vgg.vgg16(image, class_dim=CLASS_DIM)
```
类似地,VGG13和VGG19可以分别通过`vgg.vgg13``vgg.vgg19`方法获得。
3. 使用GoogLeNet模型
GoogLeNet在训练阶段使用两个辅助的分类器强化梯度信息并进行额外的正则化。因此`googlenet.googlenet`共返回三个Softmax层,如下面的代码所示:
```python
out, out1, out2 = googlenet.googlenet(image, class_dim=CLASS_DIM)
loss1 = paddle.layer.cross_entropy_cost(
input=out1, label=lbl, coeff=0.3)
paddle.evaluator.classification_error(input=out1, label=lbl)
loss2 = paddle.layer.cross_entropy_cost(
input=out2, label=lbl, coeff=0.3)
paddle.evaluator.classification_error(input=out2, label=lbl)
extra_layers = [loss1, loss2]
```
对于两个辅助的输出,这里分别对其计算损失函数并评价错误率,然后将损失作为后文SGD的extra_layers。
4. 使用ResNet模型
ResNet模型可以通过下面的代码获取:
```python
out = resnet.resnet_imagenet(image, class_dim=CLASS_DIM)
```
### 定义损失函数
```python
cost = paddle.layer.classification_cost(input=out, label=lbl)
```
### 创建参数和优化方法
```python
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=0.001 / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
```
通过 `learning_rate_decay_a` (简写$a$) 、`learning_rate_decay_b` (简写$b$) 和 `learning_rate_schedule` 指定学习率调整策略,这里采用离散指数的方式调节学习率,计算公式如下, $n$ 代表已经处理过的累计总样本数,$lr_{0}$ 即为参数里设置的 `learning_rate`
$$ lr = lr_{0} * a^ {\lfloor \frac{n}{ b}\rfloor} $$
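例如,按上面代码中的设置($lr_{0}=0.001/128$、$a=0.1$、$b=128000\times 35$),可以用下面几行代码粗略估算处理若干样本后的学习率(仅为数值演示):
```python
import math

lr0 = 0.001 / 128          # learning_rate
a, b = 0.1, 128000 * 35    # learning_rate_decay_a / learning_rate_decay_b

def discexp_lr(num_samples):
    return lr0 * a ** math.floor(float(num_samples) / b)

print(discexp_lr(0))            # 训练开始时:7.8125e-06
print(discexp_lr(128000 * 35))  # 处理完 b 个样本后衰减为原来的 1/10:7.8125e-07
```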
### 定义数据读取
首先以[花卉数据](http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html)为例说明如何定义输入。下面的代码定义了花卉数据训练集和验证集的输入:
```python
train_reader = paddle.batch(
paddle.reader.shuffle(
flowers.train(),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
flowers.valid(),
batch_size=BATCH_SIZE)
```
若需要使用其他数据,则需要先建立图像列表文件。`reader.py`定义了这种文件的读取方式,它从图像列表文件中解析出图像路径和类别标签。
图像列表文件是一个文本文件,其中每一行由一个图像路径和类别标签构成,二者以跳格符(Tab)隔开。类别标签用整数表示,其最小值为0。下面给出一个图像列表文件的片段示例:
```
dataset_100/train_images/n03982430_23191.jpeg 1
dataset_100/train_images/n04461696_23653.jpeg 7
dataset_100/train_images/n02441942_3170.jpeg 8
dataset_100/train_images/n03733281_31716.jpeg 2
dataset_100/train_images/n03424325_240.jpeg 0
dataset_100/train_images/n02643566_75.jpeg 8
```
训练时需要分别指定训练集和验证集的图像列表文件。这里假设这两个文件分别为`train.list``val.list`,数据读取方式如下:
```python
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
```
### 定义事件处理程序
```python
# End batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
```
### 定义训练方法
对于AlexNet、VGG和ResNet,可以按下面的代码定义训练方法:
```python
# Create trainer
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=optimizer)
```
GoogLeNet有两个额外的输出层,因此需要指定`extra_layers`,如下所示:
```python
# Create trainer
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers)
```
### 开始训练
```python
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
```
## 应用模型
模型训练好后,可以使用下面的代码预测给定图片的类别。
```python
import numpy as np

# load parameters
with gzip.open('params_pass_10.tar.gz', 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
file_list = [line.strip() for line in open(image_list_file)]
test_data = [(paddle.image.load_and_transform(image_file, 256, 224, False)
.flatten().astype('float32'), )
for image_file in file_list]
probs = paddle.infer(
output_layer=out, parameters=parameters, input=test_data)
lab = np.argsort(-probs)
for file_name, result in zip(file_list, lab):
print "Label of %s is: %d" % (file_name, result[0])
```
首先从文件中加载训练好的模型(代码里以第10轮迭代的结果为例),然后读取`image_list_file`中的图像。`image_list_file`是一个文本文件,每一行为一个图像路径。代码使用`paddle.infer`判断`image_list_file`中每个图像的类别,并进行输出。
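一个预测用图像列表文件的内容示例如下(路径仅为示意):
```
dataset_100/test_images/image_0001.jpeg
dataset_100/test_images/image_0002.jpeg
dataset_100/test_images/image_0003.jpeg
```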
import paddle.v2 as paddle
__all__ = ['alexnet']
def alexnet(input, class_dim):
conv1 = paddle.layer.img_conv(
input=input,
filter_size=11,
num_channels=3,
num_filters=96,
stride=4,
padding=1)
cmrnorm1 = paddle.layer.img_cmrnorm(
input=conv1, size=5, scale=0.0001, power=0.75)
pool1 = paddle.layer.img_pool(input=cmrnorm1, pool_size=3, stride=2)
conv2 = paddle.layer.img_conv(
input=pool1,
filter_size=5,
num_filters=256,
stride=1,
padding=2,
groups=1)
cmrnorm2 = paddle.layer.img_cmrnorm(
input=conv2, size=5, scale=0.0001, power=0.75)
pool2 = paddle.layer.img_pool(input=cmrnorm2, pool_size=3, stride=2)
pool3 = paddle.networks.img_conv_group(
input=pool2,
pool_size=3,
pool_stride=2,
conv_num_filter=[384, 384, 256],
conv_filter_size=3,
pool_type=paddle.pooling.Max())
fc1 = paddle.layer.fc(
input=pool3,
size=4096,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(
input=fc1,
size=4096,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(
input=fc2, size=class_dim, act=paddle.activation.Softmax())
return out
## 使用说明
`caffe2paddle.py`提供了将Caffe训练的模型转换为PaddlePaddle可使用的模型的接口`ModelConverter`,其封装了图像领域常用的Convolution、BatchNorm等layer的转换函数,可以完成VGG、ResNet等常用模型的转换。模型转换的基本过程是:基于Caffe的Python API加载模型并依次获取每一个layer的信息,将其中的参数根据layer类型与PaddlePaddle适配后序列化保存(对于Pooling等无需训练的layer不做处理),输出可以直接为PaddlePaddle的Python API加载使用的模型文件。
可以按如下方法使用`ModelConverter`接口:
```python
# 定义以下变量为相应的文件路径和文件名
caffe_model_file = "./ResNet-50-deploy.prototxt" # Caffe网络配置文件的路径
caffe_pretrained_file = "./ResNet-50-model.caffemodel" # Caffe模型文件的路径
paddle_tar_name = "Paddle_ResNet50.tar.gz" # 输出的Paddle模型的文件名
# 初始化,从指定文件加载模型
converter = ModelConverter(caffe_model_file=caffe_model_file,
caffe_pretrained_file=caffe_pretrained_file,
paddle_tar_name=paddle_tar_name)
# 进行模型转换
converter.convert()
```
`caffe2paddle.py`中已提供以上步骤,修改其中文件相关变量的值后执行`python caffe2paddle.py`即可完成模型转换。此外,为辅助验证转换结果,`ModelConverter`中封装了使用Caffe API预测的接口`caffe_predict`,使用方式如下所示,将会打印按类别概率排序的(类别id, 概率)列表:
```python
# img为图片路径,mean_file为图像均值文件的路径
converter.caffe_predict(img="./cat.jpg", mean_file="./imagenet/ilsvrc_2012_mean.npy")
```
需要注意,在模型转换时会对layer的参数进行命名,这里默认使用PaddlePaddle中默认的layer和参数命名规则:以`wrap_name_default`中的值和该layer类型的调用计数构造layer name,并以此为前缀构造参数名,比如第一个InnerProduct层(相应转换函数说明见下方)的bias参数将被命名为`___fc_layer_0__.wbias`
```python
# 对InnerProduct层的参数进行转换,使用name值构造对应layer的参数名
# wrap_name_default设置默认name值为fc_layer
@wrap_name_default("fc_layer")
def convert_InnerProduct_layer(self, params, name=None)
```
为此,在验证和使用转换得到的模型时,编写PaddlePaddle网络配置时无需指定layer name,并且要保证和Caffe端模型使用同样的拓扑顺序,尤其是对于ResNet这种有分支的网络结构,要保证两分支在PaddlePaddle和Caffe中先后顺序一致,这样才能够使得模型参数正确加载。
如果不希望使用默认的命名,并且在PaddlePaddle网络配置中指定了layer name,可以建立Caffe和PaddlePaddle网络配置间layer name对应关系的`dict`并在调用`ModelConverter.convert`时作为`name_map`的值传入,这样在命名保存layer中的参数时将使用相应的layer name,不受拓扑顺序的影响。另外这里只针对Caffe网络配置中Convolution、InnerProduct和BatchNorm类别的layer建立`name_map`即可(一方面,对于Pooling等无需训练的layer不需要保存,故这里没有提供转换接口;另一方面,对于Caffe中的Scale类别的layer,由于Caffe和PaddlePaddle在实现上的一些差别,PaddlePaddle中的batch_norm层是BatchNorm和Scale层的复合,故这里对Scale进行了特殊处理)。
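下面是构造`name_map`并传入`convert`的一个示意,其中的layer name仅为举例,实际使用时需分别与Caffe网络配置和PaddlePaddle网络配置中的名字对应:
```python
# Caffe 配置中的 layer name -> PaddlePaddle 配置中指定的 layer name(名字仅为举例)
name_map = {
    "conv1": "conv1",
    "bn_conv1": "bn_conv1",
    "fc1000": "fc1000",
}
converter.convert(name_map=name_map)
```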
import os
import struct
import gzip
import tarfile
import cStringIO
import numpy as np
import cv2
import caffe
from paddle.proto.ParameterConfig_pb2 import ParameterConfig
from paddle.trainer_config_helpers.default_decorators import wrap_name_default
class ModelConverter(object):
def __init__(self, caffe_model_file, caffe_pretrained_file,
paddle_tar_name):
self.net = caffe.Net(caffe_model_file, caffe_pretrained_file,
caffe.TEST)
self.tar_name = paddle_tar_name
self.params = dict()
self.pre_layer_name = ""
self.pre_layer_type = ""
def convert(self, name_map=None):
layer_dict = self.net.layer_dict
for layer_name in layer_dict.keys():
layer = layer_dict[layer_name]
layer_params = layer.blobs
layer_type = layer.type
if len(layer_params) > 0:
self.pre_layer_name = getattr(
self, "convert_" + layer_type + "_layer")(
layer_params,
name=None
                        if name_map is None else name_map.get(layer_name))
self.pre_layer_type = layer_type
with gzip.open(self.tar_name, 'w') as f:
self.to_tar(f)
return
def to_tar(self, f):
tar = tarfile.TarFile(fileobj=f, mode='w')
for param_name in self.params.keys():
param_conf, param_data = self.params[param_name]
confStr = param_conf.SerializeToString()
tarinfo = tarfile.TarInfo(name="%s.protobuf" % param_name)
tarinfo.size = len(confStr)
buf = cStringIO.StringIO(confStr)
buf.seek(0)
tar.addfile(tarinfo, fileobj=buf)
buf = cStringIO.StringIO()
self.serialize(param_data, buf)
tarinfo = tarfile.TarInfo(name=param_name)
buf.seek(0)
tarinfo.size = len(buf.getvalue())
tar.addfile(tarinfo, buf)
@staticmethod
def serialize(data, f):
f.write(struct.pack("IIQ", 0, 4, data.size))
f.write(data.tobytes())
@wrap_name_default("conv")
def convert_Convolution_layer(self, params, name=None):
for i in range(len(params)):
data = np.array(params[i].data)
if len(params) == 2:
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
else:
file_name = "_%s.w%s" % (name, str(i))
param_conf = ParameterConfig()
param_conf.name = file_name
param_conf.size = reduce(lambda a, b: a * b, data.shape)
self.params[file_name] = (param_conf, data.flatten())
return name
@wrap_name_default("fc_layer")
def convert_InnerProduct_layer(self, params, name=None):
for i in range(len(params)):
data = np.array(params[i].data)
if len(params) == 2:
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
else:
file_name = "_%s.w%s" % (name, str(i))
data = np.transpose(data)
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
if len(dims) < 2:
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
@wrap_name_default("batch_norm")
def convert_BatchNorm_layer(self, params, name=None):
scale = 1 / np.array(params[-1].data)[0] if np.array(
params[-1].data)[0] != 0 else 0
for i in range(2):
data = np.array(params[i].data) * scale
file_name = "_%s.w%s" % (name, str(i + 1))
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
assert len(dims) == 1
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
def convert_Scale_layer(self, params, name=None):
assert self.pre_layer_type == "BatchNorm"
name = self.pre_layer_name
for i in range(len(params)):
data = np.array(params[i].data)
suffix = "0" if i == 0 else "bias"
file_name = "_%s.w%s" % (name, suffix)
param_conf = ParameterConfig()
param_conf.name = file_name
dims = list(data.shape)
assert len(dims) == 1
dims.insert(0, 1)
param_conf.size = reduce(lambda a, b: a * b, dims)
if i == 1:
param_conf.dims.extend(dims)
self.params[file_name] = (param_conf, data.flatten())
return name
def caffe_predict(self,
img,
mean_file='./caffe/imagenet/ilsvrc_2012_mean.npy'):
net = self.net
net.blobs['data'].data[...] = load_image(img, mean_file=mean_file)
out = net.forward()
output_prob = net.blobs['prob'].data[0].flatten()
print zip(np.argsort(output_prob)[::-1], np.sort(output_prob)[::-1])
def load_image(file, resize_size=256, crop_size=224, mean_file=None):
# load image
im = cv2.imread(file)
# resize
h, w = im.shape[:2]
h_new, w_new = resize_size, resize_size
if h > w:
h_new = resize_size * h / w
else:
w_new = resize_size * w / h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
# crop
h, w = im.shape[:2]
h_start = (h - crop_size) / 2
w_start = (w - crop_size) / 2
h_end, w_end = h_start + crop_size, w_start + crop_size
im = im[h_start:h_end, w_start:w_end, :]
# transpose to CHW order
im = im.transpose((2, 0, 1))
if mean_file:
mu = np.load(mean_file)
mu = mu.mean(1).mean(1)
im = im - mu[:, None, None]
im = im / 255.0
return im
if __name__ == "__main__":
caffe_model_file = "./ResNet-50-deploy.prototxt"
caffe_pretrained_file = "./ResNet-50-model.caffemodel"
paddle_tar_name = "Paddle_ResNet50.tar.gz"
converter = ModelConverter(
caffe_model_file=caffe_model_file,
caffe_pretrained_file=caffe_pretrained_file,
paddle_tar_name=paddle_tar_name)
converter.convert()
converter.caffe_predict("./cat.jpg",
"./caffe/imagenet/ilsvrc_2012_mean.npy")
import paddle.v2 as paddle
__all__ = ['googlenet']
def inception(name, input, channels, filter1, filter3R, filter3, filter5R,
filter5, proj):
cov1 = paddle.layer.img_conv(
name=name + '_1',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter1,
stride=1,
padding=0)
cov3r = paddle.layer.img_conv(
name=name + '_3r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter3R,
stride=1,
padding=0)
cov3 = paddle.layer.img_conv(
name=name + '_3',
input=cov3r,
filter_size=3,
num_filters=filter3,
stride=1,
padding=1)
cov5r = paddle.layer.img_conv(
name=name + '_5r',
input=input,
filter_size=1,
num_channels=channels,
num_filters=filter5R,
stride=1,
padding=0)
cov5 = paddle.layer.img_conv(
name=name + '_5',
input=cov5r,
filter_size=5,
num_filters=filter5,
stride=1,
padding=2)
pool1 = paddle.layer.img_pool(
name=name + '_max',
input=input,
pool_size=3,
num_channels=channels,
stride=1,
padding=1)
covprj = paddle.layer.img_conv(
name=name + '_proj',
input=pool1,
filter_size=1,
num_filters=proj,
stride=1,
padding=0)
cat = paddle.layer.concat(name=name, input=[cov1, cov3, cov5, covprj])
return cat
def googlenet(input, class_dim):
# stage 1
conv1 = paddle.layer.img_conv(
name="conv1",
input=input,
filter_size=7,
num_channels=3,
num_filters=64,
stride=2,
padding=3)
pool1 = paddle.layer.img_pool(
name="pool1", input=conv1, pool_size=3, num_channels=64, stride=2)
# stage 2
conv2_1 = paddle.layer.img_conv(
name="conv2_1",
input=pool1,
filter_size=1,
num_filters=64,
stride=1,
padding=0)
conv2_2 = paddle.layer.img_conv(
name="conv2_2",
input=conv2_1,
filter_size=3,
num_filters=192,
stride=1,
padding=1)
pool2 = paddle.layer.img_pool(
name="pool2", input=conv2_2, pool_size=3, num_channels=192, stride=2)
# stage 3
ince3a = inception("ince3a", pool2, 192, 64, 96, 128, 16, 32, 32)
ince3b = inception("ince3b", ince3a, 256, 128, 128, 192, 32, 96, 64)
pool3 = paddle.layer.img_pool(
name="pool3", input=ince3b, num_channels=480, pool_size=3, stride=2)
# stage 4
ince4a = inception("ince4a", pool3, 480, 192, 96, 208, 16, 48, 64)
ince4b = inception("ince4b", ince4a, 512, 160, 112, 224, 24, 64, 64)
ince4c = inception("ince4c", ince4b, 512, 128, 128, 256, 24, 64, 64)
ince4d = inception("ince4d", ince4c, 512, 112, 144, 288, 32, 64, 64)
ince4e = inception("ince4e", ince4d, 528, 256, 160, 320, 32, 128, 128)
pool4 = paddle.layer.img_pool(
name="pool4", input=ince4e, num_channels=832, pool_size=3, stride=2)
# stage 5
ince5a = inception("ince5a", pool4, 832, 256, 160, 320, 32, 128, 128)
ince5b = inception("ince5b", ince5a, 832, 384, 192, 384, 48, 128, 128)
pool5 = paddle.layer.img_pool(
name="pool5",
input=ince5b,
num_channels=1024,
pool_size=7,
stride=7,
pool_type=paddle.pooling.Avg())
dropout = paddle.layer.addto(
input=pool5,
layer_attr=paddle.attr.Extra(drop_rate=0.4),
act=paddle.activation.Linear())
out = paddle.layer.fc(
input=dropout, size=class_dim, act=paddle.activation.Softmax())
# fc for output 1
pool_o1 = paddle.layer.img_pool(
name="pool_o1",
input=ince4a,
num_channels=512,
pool_size=5,
stride=3,
pool_type=paddle.pooling.Avg())
conv_o1 = paddle.layer.img_conv(
name="conv_o1",
input=pool_o1,
filter_size=1,
num_filters=128,
stride=1,
padding=0)
fc_o1 = paddle.layer.fc(
name="fc_o1",
input=conv_o1,
size=1024,
layer_attr=paddle.attr.Extra(drop_rate=0.7),
act=paddle.activation.Relu())
out1 = paddle.layer.fc(
input=fc_o1, size=class_dim, act=paddle.activation.Softmax())
# fc for output 2
pool_o2 = paddle.layer.img_pool(
name="pool_o2",
input=ince4d,
num_channels=528,
pool_size=5,
stride=3,
pool_type=paddle.pooling.Avg())
conv_o2 = paddle.layer.img_conv(
name="conv_o2",
input=pool_o2,
filter_size=1,
num_filters=128,
stride=1,
padding=0)
fc_o2 = paddle.layer.fc(
name="fc_o2",
input=conv_o2,
size=1024,
layer_attr=paddle.attr.Extra(drop_rate=0.7),
act=paddle.activation.Relu())
out2 = paddle.layer.fc(
input=fc_o2, size=class_dim, act=paddle.activation.Softmax())
return out, out1, out2
import gzip
import paddle.v2 as paddle
import reader
import vgg
import resnet
import alexnet
import googlenet
import argparse
import os
from PIL import Image
import numpy as np
WIDTH = 224
HEIGHT = 224
DATA_DIM = 3 * WIDTH * HEIGHT
CLASS_DIM = 102
def main():
# parse the argument
parser = argparse.ArgumentParser()
parser.add_argument(
'data_list',
help='The path of data list file, which consists of one image path per line'
)
parser.add_argument(
'model',
help='The model for image classification',
choices=['alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet'])
parser.add_argument(
'params_path', help='The file which stores the parameters')
args = parser.parse_args()
# PaddlePaddle init
paddle.init(use_gpu=True, trainer_count=1)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
if args.model == 'alexnet':
out = alexnet.alexnet(image, class_dim=CLASS_DIM)
elif args.model == 'vgg13':
out = vgg.vgg13(image, class_dim=CLASS_DIM)
elif args.model == 'vgg16':
out = vgg.vgg16(image, class_dim=CLASS_DIM)
elif args.model == 'vgg19':
out = vgg.vgg19(image, class_dim=CLASS_DIM)
elif args.model == 'resnet':
out = resnet.resnet_imagenet(image, class_dim=CLASS_DIM)
elif args.model == 'googlenet':
out, _, _ = googlenet.googlenet(image, class_dim=CLASS_DIM)
# load parameters
with gzip.open(args.params_path, 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
file_list = [line.strip() for line in open(args.data_list)]
test_data = [(paddle.image.load_and_transform(image_file, 256, 224, False)
.flatten().astype('float32'), ) for image_file in file_list]
probs = paddle.infer(
output_layer=out, parameters=parameters, input=test_data)
lab = np.argsort(-probs)
for file_name, result in zip(file_list, lab):
print "Label of %s is: %d" % (file_name, result[0])
if __name__ == '__main__':
main()
import random
from paddle.v2.image import load_and_transform
import paddle.v2 as paddle
from multiprocessing import cpu_count
def train_mapper(sample):
'''
map image path to type needed by model input layer for the training set
'''
img, label = sample
img = paddle.image.load_image(img)
img = paddle.image.simple_transform(img, 256, 224, True)
return img.flatten().astype('float32'), label
def test_mapper(sample):
'''
map image path to type needed by model input layer for the test set
'''
img, label = sample
img = paddle.image.load_image(img)
img = paddle.image.simple_transform(img, 256, 224, True)
return img.flatten().astype('float32'), label
def train_reader(train_list, buffered_size=1024):
def reader():
with open(train_list, 'r') as f:
lines = [line.strip() for line in f]
for line in lines:
img_path, lab = line.strip().split('\t')
yield img_path, int(lab)
return paddle.reader.xmap_readers(train_mapper, reader,
cpu_count(), buffered_size)
def test_reader(test_list, buffered_size=1024):
def reader():
with open(test_list, 'r') as f:
lines = [line.strip() for line in f]
for line in lines:
img_path, lab = line.strip().split('\t')
yield img_path, int(lab)
return paddle.reader.xmap_readers(test_mapper, reader,
cpu_count(), buffered_size)
if __name__ == '__main__':
for im in train_reader('train.list'):
print len(im[0])
    for im in test_reader('test.list'):
print len(im[0])
import paddle.v2 as paddle
__all__ = ['resnet_imagenet', 'resnet_cifar10']
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
active_type=paddle.activation.Relu(),
ch_in=None):
tmp = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=ch_in,
num_filters=ch_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=tmp, act=active_type)
def shortcut(input, ch_in, ch_out, stride):
if ch_in != ch_out:
return conv_bn_layer(input, ch_out, 1, stride, 0,
paddle.activation.Linear())
else:
return input
def basicblock(input, ch_in, ch_out, stride):
short = shortcut(input, ch_in, ch_out, stride)
conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, paddle.activation.Linear())
return paddle.layer.addto(
input=[short, conv2], act=paddle.activation.Relu())
def bottleneck(input, ch_in, ch_out, stride):
short = shortcut(input, ch_in, ch_out * 4, stride)
conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0,
paddle.activation.Linear())
return paddle.layer.addto(
input=[short, conv3], act=paddle.activation.Relu())
def layer_warp(block_func, input, ch_in, ch_out, count, stride):
conv = block_func(input, ch_in, ch_out, stride)
for i in range(1, count):
conv = block_func(conv, ch_out, ch_out, 1)
return conv
def resnet_imagenet(input, class_dim, depth=50):
cfg = {
        18: ([2, 2, 2, 2], basicblock),
34: ([3, 4, 6, 3], basicblock),
50: ([3, 4, 6, 3], bottleneck),
101: ([3, 4, 23, 3], bottleneck),
152: ([3, 8, 36, 3], bottleneck)
}
stages, block_func = cfg[depth]
conv1 = conv_bn_layer(
input, ch_in=3, ch_out=64, filter_size=7, stride=2, padding=3)
pool1 = paddle.layer.img_pool(input=conv1, pool_size=3, stride=2)
res1 = layer_warp(block_func, pool1, 64, 64, stages[0], 1)
res2 = layer_warp(block_func, res1, 64, 128, stages[1], 2)
res3 = layer_warp(block_func, res2, 128, 256, stages[2], 2)
res4 = layer_warp(block_func, res3, 256, 512, stages[3], 2)
pool2 = paddle.layer.img_pool(
input=res4, pool_size=7, stride=1, pool_type=paddle.pooling.Avg())
out = paddle.layer.fc(
input=pool2, size=class_dim, act=paddle.activation.Softmax())
return out
def resnet_cifar10(input, class_dim, depth=32):
# depth should be one of 20, 32, 44, 56, 110, 1202
assert (depth - 2) % 6 == 0
n = (depth - 2) / 6
nStages = {16, 64, 128}
conv1 = conv_bn_layer(
input, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
pool = paddle.layer.img_pool(
input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
out = paddle.layer.fc(
input=pool, size=class_dim, act=paddle.activation.Softmax())
return out
import gzip
import paddle.v2.dataset.flowers as flowers
import paddle.v2 as paddle
import reader
import vgg
import resnet
import alexnet
import googlenet
import argparse
DATA_DIM = 3 * 224 * 224
CLASS_DIM = 102
BATCH_SIZE = 128
def main():
# parse the argument
parser = argparse.ArgumentParser()
parser.add_argument(
'model',
help='The model for image classification',
choices=['alexnet', 'vgg13', 'vgg16', 'vgg19', 'resnet', 'googlenet'])
args = parser.parse_args()
# PaddlePaddle init
paddle.init(use_gpu=True, trainer_count=1)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(DATA_DIM))
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(CLASS_DIM))
extra_layers = None
learning_rate = 0.01
if args.model == 'alexnet':
out = alexnet.alexnet(image, class_dim=CLASS_DIM)
elif args.model == 'vgg13':
out = vgg.vgg13(image, class_dim=CLASS_DIM)
elif args.model == 'vgg16':
out = vgg.vgg16(image, class_dim=CLASS_DIM)
elif args.model == 'vgg19':
out = vgg.vgg19(image, class_dim=CLASS_DIM)
elif args.model == 'resnet':
out = resnet.resnet_imagenet(image, class_dim=CLASS_DIM)
learning_rate = 0.1
elif args.model == 'googlenet':
out, out1, out2 = googlenet.googlenet(image, class_dim=CLASS_DIM)
loss1 = paddle.layer.cross_entropy_cost(
input=out1, label=lbl, coeff=0.3)
paddle.evaluator.classification_error(input=out1, label=lbl)
loss2 = paddle.layer.cross_entropy_cost(
input=out2, label=lbl, coeff=0.3)
paddle.evaluator.classification_error(input=out2, label=lbl)
extra_layers = [loss1, loss2]
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
BATCH_SIZE),
learning_rate=learning_rate / BATCH_SIZE,
learning_rate_decay_a=0.1,
learning_rate_decay_b=128000 * 35,
learning_rate_schedule="discexp", )
train_reader = paddle.batch(
paddle.reader.shuffle(
flowers.train(),
# To use other data, replace the above line with:
# reader.train_reader('train.list'),
buf_size=1000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
flowers.valid(),
# To use other data, replace the above line with:
# reader.test_reader('val.list'),
batch_size=BATCH_SIZE)
# End batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# Create trainer
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=extra_layers)
trainer.train(
reader=train_reader, num_passes=200, event_handler=event_handler)
if __name__ == '__main__':
main()
import paddle.v2 as paddle
__all__ = ['vgg13', 'vgg16', 'vgg19']
def vgg(input, nums, class_dim):
def conv_block(input, num_filter, groups, num_channels=None):
return paddle.networks.img_conv_group(
input=input,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
pool_type=paddle.pooling.Max())
assert len(nums) == 5
# the channel of input feature is 3
conv1 = conv_block(input, 64, nums[0], 3)
conv2 = conv_block(conv1, 128, nums[1])
conv3 = conv_block(conv2, 256, nums[2])
conv4 = conv_block(conv3, 512, nums[3])
conv5 = conv_block(conv4, 512, nums[4])
fc_dim = 4096
fc1 = paddle.layer.fc(
input=conv5,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(
input=fc1,
size=fc_dim,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
out = paddle.layer.fc(
input=fc2, size=class_dim, act=paddle.activation.Softmax())
return out
def vgg13(input, class_dim):
nums = [2, 2, 2, 2, 2]
return vgg(input, nums, class_dim)
def vgg16(input, class_dim):
nums = [2, 2, 3, 3, 3]
return vgg(input, nums, class_dim)
def vgg19(input, class_dim):
nums = [2, 2, 4, 4, 4]
return vgg(input, nums, class_dim)
# 排序学习(Learning To Rank)
排序学习技术\[[1](#参考文献1)\]是构建排序模型的机器学习方法,在信息检索、自然语言处理、数据挖掘等机器学习场景中具有重要作用。排序学习的主要目的是对于给定的一组文档,针对任意查询请求给出反映相关性的文档排序。在本例中,利用标注过的语料库训练RankNet\[[4](#参考文献4)\]和LambdaRank\[[6](#参考文献6)\]两种经典排序模型,训练好的模型能够对任意查询请求给出相关性文档排序。
RankNet模型在命令行输入:
```python
python ranknet.py
```
LambdaRank模型在命令行输入:
```python
python lambda_rank.py
```
用户只需要使用以上命令就完成排序模型的训练和预测,程序会自动下载内置数据集,无需手动下载。
## 背景介绍
排序学习技术随着互联网的快速增长而受到越来越多关注,是机器学习中的常见任务之一。一方面人工排序规则不能处理海量规模的候选数据,另一方面无法为不同渠道的候选数据给予合适的权重,因此排序学习在日常生活中应用非常广泛。排序学习起源于信息检索领域,目前仍然是许多信息检索场景中的核心模块,例如搜索引擎搜索结果排序,推荐系统候选集排序,在线广告排序等等。本例以文档检索任务阐述排序学习模型。
<p align="center">
<img src="images/search_engine_example.png" width="30%" ><br/>
图1. 排序模型在文档检索的典型应用搜索引擎中的作用
</p>
假定有一组文档$S$,文档检索任务是依据文档和查询请求的相关性,给出文档的排列顺序。对于一条查询请求,排序模型会给每个文档打分,并依据打分倒序排列文档,得到查询结果。在训练模型时,给定一条查询以及对应文档的最佳排序和得分;在预测时,给定查询请求,排序模型生成文档排序。常见的排序学习方法划分为以下三类:
- Pointwise 方法
Pointwise方法是通过近似为回归问题解决排序问题,输入的单条样本为**得分-文档**,将每个查询-文档对的相关性得分作为实数分数或者序数分数,使得单个查询-文档对作为样本点(Pointwise的由来),训练排序模型。预测时候对于指定输入,给出查询-文档对的相关性得分。
- Pairwise方法
Pairwise方法是通过近似为分类问题解决排序问题,输入的单条样本为**标签-文档对**。对于一次查询的多个结果文档,组合任意两个文档形成文档对作为输入样本。即学习一个二分类器,对输入的一对文档对AB(Pairwise的由来),根据A相关性是否比B好,二分类器给出分类标签1或0。对所有文档对进行分类,就可以得到一组偏序关系,从而构造文档全集的排序关系。该类方法的原理是对给定的文档全集$S$,降低排序中的逆序文档对的个数来降低排序错误,从而达到优化排序结果的目的。
- Listwise方法
Listwise方法直接优化排序列表,输入的单条样本为一个**文档排列**。通过构造合适的度量函数衡量当前文档排序和最优排序的差值,并优化该度量函数得到排序模型。由于很多度量函数具有非连续的性质,优化较为困难。
<p align="center">
<img src="images/learning_to_rank.jpg" width="50%" ><br/>
图2. 排序模型三类方法
</p>
## 实验数据
本例中的实验数据采用了排序学习中的基准数据[LETOR](http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar)语料库,部分来自于Gov2网站的查询请求结果,包含了约1700条查询请求结果文档列表,并对文档相关性做出了人工标注。其中,一条查询含有唯一的查询id,对应于多个具有相关性的文档,构成了一次查询请求结果文档列表。每个文档由一个一维数组的特征向量表示,并对应一个人工标注的与查询的相关性分数。
本例在第一次运行时会自动下载LETOR MQ2007数据集并缓存,无需手动下载。
`mq2007`数据集分别提供了三种类型排序模型的生成格式,需要指定生成格式`format`
例如调用接口
```python
pairwise_train_dataset = functools.partial(paddle.dataset.mq2007.train, format="pairwise")
for label, left_doc, right_doc in pairwise_train_dataset():
...
```
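类似地,LambdaRank所使用的listwise格式可以按如下方式读取,每次产出一条查询对应的得分序列与文档特征序列(具体字段以数据集接口的实际定义为准,此处仅为示意):
```python
import functools
import paddle.v2 as paddle

listwise_train_dataset = functools.partial(
    paddle.dataset.mq2007.train, format="listwise")
for label_list, query_docs_feature in listwise_train_dataset():
    # label_list: 同一条查询下各文档的相关性得分序列
    # query_docs_feature: 对应的文档特征向量序列
    ...
```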
## 模型概览
对于排序模型,本例中提供了Pairwise方法的模型RankNet和Listwise方法的模型LambdaRank,分别代表了两类学习方法。Pointwise方法的排序模型退化为回归问题,请参考PaddleBook中[推荐系统](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md)一课。
## RankNet排序模型
[RankNet](http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)是一种经典的Pairwise的排序学习方法,是典型的前向神经网络排序模型。在文档集合$S$中的第$i$个文档记做$U_i$,它的文档特征向量记做$x_i$,对于给定的一个文档对$U_i$, $U_j$,RankNet将输入的单个文档特征向量$x$映射到$f(x)$,得到$s_i=f(x_i)$, $s_j=f(x_j)$。将$U_i$相关性比$U_j$好的概率记做$P_{i,j}$,则
$$P_{i,j}=P(U_{i}>U_{j})=\frac{1}{1+e^{-\sigma (s_{i}-s_{j})}}$$
由于排序度量函数大多数非连续,非光滑,因此RankNet需要一个可以优化的度量函数$C$。首先使用交叉熵作为度量函数衡量预测代价,将损失函数$C$记做
$$C_{i,j}=-\bar{P_{i,j}}\log P_{i,j}-(1-\bar{P_{i,j}})\log(1-P_{i,j})$$
其中$\bar{P_{i,j}}$代表真实概率,记做
$$\bar{P_{i,j}}=\frac{1}{2}(1+S_{i,j})$$
而$S_{i,j}\in\{+1,0,-1\}$,表示$U_i$和$U_j$组成的Pair的标签,即$U_i$的相关性是否好于$U_j$。
最终得到了可求导的度量损失函数
$$C=\frac{1}{2}(1-S_{i,j})\sigma (s_{i}-s_{j})+\log(1+e^{-\sigma (s_{i}-s_{j})})$$
可以使用常规的梯度下降方法进行优化。细节见[RankNet](http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)
同时,得到文档$U_i$在排序优化过程的梯度信息为
$$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}} = \sigma\left(\frac{1}{2}(1-S_{i,j})-\frac{1}{1+e^{\sigma (s_{i}-s_{j})}}\right)$$
表示的含义是本轮排序优化过程中文档$U_i$的上升或者下降量。
根据以上推论构造RankNet网络结构,由若干层隐藏层和全连接层构成,如图所示,将文档特征使用隐藏层,全连接层逐层变换,完成了底层特征空间到高层特征空间的变换。其中docA和docB结构对称,分别输入到最终的RankCost层中。
<p align="center">
<img src="images/ranknet.jpg" width="50%" ><br/>
图3. RankNet网络结构示意图
</p>
- 全连接层(fully connected layer) : 指上一层中的每个节点都连接到下层网络。本例子中同样使用`paddle.layer.fc`实现,注意输入到RankCost层的全连接层维度为1。
- RankCost层: RankCost层是排序网络RankNet的核心,度量docA相关性是否比docB好,给出预测值并和label比较。使用了交叉熵(cross entropy)作为度量损失函数,使用梯度下降方法进行优化。细节可见[RankNet](http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf)[4]。
由于Pairwise中的网络结构是左右对称,可定义一半网络结构,另一半共享网络参数。在PaddlePaddle中允许网络结构中共享连接,具有相同名字的参数将会共享参数。使用PaddlePaddle实现RankNet排序模型,定义网络结构的示例代码如下:
```python
import paddle.v2 as paddle
def half_ranknet(name_prefix, input_dim):
"""
parameter with a same name will be shared in PaddlePaddle framework,
these parameters in ranknet can be used in shared state, e.g. left network and right network in detail
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
"""
# data layer
data = paddle.layer.data(name_prefix+"/data", paddle.data_type.dense_vector(input_dim))
# fully connect layer
hd1 = paddle.layer.fc(
input=data,
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
# fully connected layer/ output layer
output = paddle.layer.fc(
input=hd1,
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
return output
def ranknet(input_dim):
# label layer
label = paddle.layer.data("label", paddle.data_type.integer_value(1))
# reuse the parameter in half_ranknet
output_left = half_ranknet("left", input_dim)
output_right = half_ranknet("right", input_dim)
# rankcost layer
cost = paddle.layer.rank_cost(name="cost", left=output_left, right=output_right, label=label)
return cost
```
上述结构中使用了和图3相同的模型结构:两层隐藏层,分别是`hidden_size=10`的全连接层和`hidden_size=1`的全连接层。本例中的input_dim指输入**单个文档**的特征的维度,label取值为1,0。每条输入样本为`<label>,<docA, docB>`的结构,以docA为例,输入`input_dim`的文档特征,依次变换成10维,1维特征,最终输入到RankCost层中,比较docA和docB在RankCost输出得到预测值。
### RankNet模型训练
RankNet的训练只需要运行命令:
```python
python ranknet.py
```
将会自动下载数据,训练RankNet模型,并将每个轮次的模型参数存储下来。
### RankNet模型预测
本例提供了RankNet模型的训练和预测两个部分。完成训练后的模型分为拓扑结构(需要注意`rank_cost`不是模型拓扑结构的一部分)和模型参数文件两部分。在本例子中复用了`ranknet`训练时的模型拓扑结构`half_ranknet`,模型参数从外存中加载。模型预测的输入为单个文档的特征向量,模型会给出相关性得分。将预测得分排序即可得到最终的文档相关性排序结果。
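与上述说明对应的一个预测示意片段如下(模型文件名、特征维数等均为示例):
```python
import gzip
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)
input_dim = 46  # MQ2007 中单个文档的特征维数,此处仅为示例
# 预测只需要 half_ranknet 这一半拓扑,rank_cost 不参与预测;
# 沿用训练时的 "left" 前缀以保证参数命名一致
output = half_ranknet("left", input_dim)
with gzip.open("ranknet_params_0.tar.gz", "r") as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
feature_vector = [0.0] * input_dim  # 实际使用时替换为真实的文档特征
# 每条输入是单个文档的特征向量,输出即该文档的相关性得分
scores = paddle.infer(
    output_layer=output, parameters=parameters, input=[(feature_vector, )])
```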
## 用户自定义RankNet数据
上述的代码使用了PaddlePaddle内置的排序数据,如果希望使用自定义格式数据,可以参考PaddlePaddle内置的`mq2007`数据集,编写一个新的生成器函数。例如输入数据为如下格式,只包含doc0-doc2三个文档。
\<query_id\> \<relevance_score\> \<feature_vector\>的格式(featureid: feature_value)
```
query_id : 1, relevance_score:1, feature_vector 0:0.1, 1:0.2, 2:0.4 #doc0
query_id : 1, relevance_score:2, feature_vector 0:0.3, 1:0.1, 2:0.4 #doc1
query_id : 1, relevance_score:0, feature_vector 0:0.2, 1:0.4, 2:0.1 #doc2
query_id : 2, relevance_score:0, feature_vector 0:0.1, 1:0.4, 2:0.1 #doc0
.....
```
需要将输入样本转换为Pairwise的输入格式,例如组合生成格式与mq2007 Pairwise格式相同的结构
\<label\> \<docA_feature_vector\> \<docB_feature_vector\>
```
1 doc1 doc0
1 doc1 doc2
1 doc0 doc2
....
```
注意,一般在Pairwise格式的数据中,label=1表示docA和查询的相关性好于docB,事实上label信息隐含在docA和docB组合pair中。如果存在`0 docA docB`,交换顺序构造`1 docB docA`即可。
另外组合所有的pair会有训练数据冗余,因为可以从部分偏序关系恢复文档集上的全序关系。相关研究见[PairWise approach](http://www.machinelearning.org/proceedings/icml2007/papers/139.pdf)[[5](#参考文献5)\],本例不予赘述。
```python
# a customized data generator
def gen_pairwise_data(text_line_of_data):
"""
return :
------
label : np.array, shape=(1)
docA_feature_vector : np.array, shape=(1, feature_dimension)
    docB_feature_vector : np.array, shape=(1, feature_dimension)
"""
return label, docA_feature_vector, docB_feature_vector
```
对应于paddle的输入中,`integer_value`为单个整数,`dense_vector`为实数一维向量,与生成器对应,需要在训练模型之前指明输入数据对应关系。
```python
# Define the input data order
feeding = { "label":0,
"left/data" :1,
"right/data":2}
```
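该`feeding`在调用训练接口时传入,用于说明reader产出的各列与data layer的对应关系,示意如下(其中`train_reader`、`event_handler`等为训练脚本中已定义的对象):
```python
trainer.train(
    reader=paddle.batch(train_reader, batch_size=100),
    feeding=feeding,
    event_handler=event_handler,
    num_passes=num_passes)
```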
## LambdaRank排序模型
[LambdaRank](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf)\[[6](#参考文献6)\]是Listwise的排序方法,由Burges等人从RankNet发展而来,通过构造lambda函数(LambdaRank名字的由来)来优化度量标准NDCG(Normalized Discounted Cumulative Gain),每个查询得到的结果文档列表都单独作为一个训练样本。NDCG是信息检索中衡量文档列表排序质量的常用指标之一,前$K$个文档的NDCG得分记做
$$NDCG@K=Z_{K}\sum_{i=1}^{K}\frac{2^{rel_{i}}-1}{\log_{2}(i+1)}$$
前文在RankNet中推导出,优化文档排序需要的是损失对得分的梯度信息。NDCG度量函数是非光滑、非连续的,不能直接求得梯度,因此引入交换一对文档位置所引起的NDCG变化量$|\Delta NDCG|=|NDCG(new) - NDCG(old)|$,构造lambda函数为
$$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}}=-\frac{\sigma }{1+e^{\sigma (s_{i}-s_{j})}}|\Delta NDCG|$$
替换RankNet中的梯度表示,得到的排序模型称为[LambdaRank](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf)
由以上推导可知,LambdaRank网络结构和RankNet结构非常相似。如图所示
<p align="center">
<img src="images/lambdarank.jpg" width="50%" ><br/>
图4. LambdaRank的网络结构示意图
</p>
一个查询得到的结果文档列表作为一条样本输入到网络中,替换RankCost为LambdaCost层,其他结构与RankNet相同。
- LambdaCost层 : LambdaCost层使用NDCG差值作为Lambda函数,score是一个一维的序列;对于单条训练样本,全连接层输出的是1x1的序列,二者的序列长度都等于该条查询得到的文档数量。Lambda函数的构造详见[LambdaRank](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf)
使用PaddlePaddle定义LambdaRank网络结构的示例代码如下:
```python
import paddle.v2 as paddle
def lambda_rank(input_dim):
"""
lambda_rank is a ListWise Rank Model, input data and label must be sequence
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
parameters :
input_dim, one document's dense feature vector dimension
dense_vector_sequence format
[[f, ...], [f, ...], ...], f is represent for an float or int number
"""
label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1))
data = paddle.layer.data("data",
paddle.data_type.dense_vector_sequence(input_dim))
# hidden layer
hd1 = paddle.layer.fc(
input=data,
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
output = paddle.layer.fc(
input=hd1,
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01))
# cost layer
cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output
```
上述结构使用了图4所示的模型结构。和RankNet相似,分别使用了`hidden_size=10`和`hidden_size=1`的两个全连接层。本例中的input_dim指输入**单个文档**的特征的维度。每条输入样本对应一条查询的结果文档列表:对其中每个文档,输入input_dim维的文档特征,依次变换成10维、1维特征,最终输入到LambdaCost层中。需要注意这里的label和data格式为**dense_vector_sequence**,表示一列文档得分或者文档特征组成的**序列**。
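以一条含3个文档、特征维数为2的查询为例,单条listwise样本的形式大致如下(数值仅为演示):
```python
# label: 每个文档一个得分,序列长度等于该查询对应的文档数
label = [[2.0], [1.0], [0.0]]
# data: 每个文档一个 input_dim 维的特征向量
data = [[0.3, 0.1], [0.1, 0.2], [0.2, 0.4]]
sample = (label, data)  # 与 feeding = {"label": 0, "data": 1} 的列顺序对应
```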
### LambdaRank模型训练
训练LambdaRank模型只需要运行命令:
```python
python lambda_rank.py
```
脚本会自动下载数据,训练LambdaRank模型,并将每个轮次的模型存储下来。
### LambdaRank模型预测
LambdaRank模型预测过程和RankNet相同。预测时的模型拓扑结构复用代码中的模型定义,从外存加载对应的参数文件。预测时的输入是文档列表,输出是该文档列表的各个文档相关性打分,根据打分对文档进行重新排序,即可得到最终的文档排序结果。
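例如,得到各文档的打分后,可以按如下方式得到重新排序的结果(数值仅为演示):
```python
import numpy as np

# scores 为模型对某条查询下各文档的打分(示例数值)
scores = np.array([0.7, 2.1, -0.3, 1.2])
ranked_doc_indices = np.argsort(-scores)  # 得分从高到低的文档下标
print(ranked_doc_indices)                 # [1 3 0 2]
```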
## 自定义 LambdaRank数据
上面的代码使用了PaddlePaddle内置的mq2007数据,如果希望使用自定义格式数据,可以参考PaddlePaddle内置的`mq2007`数据集,编写一个生成器函数。例如输入数据为如下格式,只包含doc0-doc2三个文档。
\<query_id\> \<relevance_score\> \<feature_vector\>的格式
```
query_id : 1, relevance_score:1, feature_vector 0:0.1, 1:0.2, 2:0.4 #doc0
query_id : 1, relevance_score:2, feature_vector 0:0.3, 1:0.1, 2:0.4 #doc1
query_id : 1, relevance_score:0, feature_vector 0:0.2, 1:0.4, 2:0.1 #doc2
query_id : 2, relevance_score:0, feature_vector 0:0.1, 1:0.4, 2:0.1 #doc0
query_id : 2, relevance_score:2, feature_vector 0:0.1, 1:0.4, 2:0.1 #doc1
.....
```
需要转换为Listwise格式,例如
\<query_id\> \<relevance_score\> \<feature_vector\>
```tex
1 1 0.1,0.2,0.4
1 2 0.3,0.1,0.4
1 0 0.2,0.4,0.1
2 0 0.1,0.4,0.1
2 2 0.1,0.4,0.1
......
```
**数据格式注意**
- 数据中每条样本对应的文档数量都必须大于`lambda_cost`层的NDCG_num
- 若单条样本对应的所有文档的相关性得分都为0,则NDCG计算无效,可以判定该query无效,我们在训练中过滤掉了这样的query。
```python
# self define data generator
def gen_listwise_data(text_all_lines_of_data):
"""
return :
------
label : np.array, shape=(samples_num, )
querylist : np.array, shape=(samples_num, feature_dimension)
"""
return label_list, query_docs_feature_vector_matrix
```
对应于PaddlePaddle输入,`label`的类型为`dense_vector_sequence`,是得分的序列,`data`的类型为`dense_vector_sequence`,是特征向量的序列输入,`input_dim`为单个文档的一维特征向量维度,与生成器对应,需要在训练模型之前指明输入数据对应关系。
```python
# Define the input data order
feeding = {"label":0,
"data" : 1}
```
## 总结
LTR在实际生活中有着广泛的应用。排序模型的构造方法一般可划分为Pointwise、Pairwise、Listwise三类方法。本例以LETOR的mq2007数据为例,阐述了Pairwise的经典方法RankNet和Listwise方法中的LambdaRank,展示了如何使用PaddlePaddle框架构造对应的排序模型结构,并提供了自定义数据类型样例。PaddlePaddle提供了灵活的编程接口,可以使用同一套代码在单机单GPU和多机分布式多GPU环境下完成LTR类型任务。
## 参考文献
1. https://en.wikipedia.org/wiki/Learning_to_rank
2. Liu T Y. [Learning to rank for information retrieval](http://ftp.nowpublishers.com/article/DownloadSummary/INR-016)[J]. Foundations and Trends® in Information Retrieval, 2009, 3(3): 225-331.
3. Li H. [Learning to rank for information retrieval and natural language processing](http://www.morganclaypool.com/doi/abs/10.2200/S00607ED2V01Y201410HLT026)[J]. Synthesis Lectures on Human Language Technologies, 2014, 7(3): 1-121.
4. Burges C, Shaked T, Renshaw E, et al. [Learning to rank using gradient descent](http://machinelearning.wustl.edu/mlpapers/paper_files/icml2005_BurgesSRLDHH05.pdf)[C]//Proceedings of the 22nd international conference on Machine learning. ACM, 2005: 89-96.
5. Cao Z, Qin T, Liu T Y, et al. [Learning to rank: from pairwise approach to listwise approach](http://machinelearning.wustl.edu/mlpapers/paper_files/icml2007_CaoQLTL07.pdf)[C]//Proceedings of the 24th international conference on Machine learning. ACM, 2007: 129-136.
6. Burges C J C, Ragno R, Le Q V. [Learning to rank with nonsmooth cost functions](https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf)[C]//NIPS. 2006, 6: 193-200.
import os, sys
import gzip
import paddle.v2 as paddle
import numpy as np
import functools
def lambda_rank(input_dim):
"""
lambda_rank is a Listwise rank model, the input data and label must be sequences.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
parameters :
input_dim, one document's dense feature vector dimension
format of the dense_vector_sequence:
[[f, ...], [f, ...], ...], f is a float or an int number
"""
label = paddle.layer.data("label",
paddle.data_type.dense_vector_sequence(1))
data = paddle.layer.data("data",
paddle.data_type.dense_vector_sequence(input_dim))
# hidden layer
hd1 = paddle.layer.fc(
input=data,
size=128,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
hd2 = paddle.layer.fc(
input=hd1,
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01))
output = paddle.layer.fc(
input=hd2,
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01))
# evaluator
evaluator = paddle.evaluator.auc(input=output, label=label)
# cost layer
cost = paddle.layer.lambda_cost(
input=output, score=label, NDCG_num=6, max_sort_size=-1)
return cost, output
def train_lambda_rank(num_passes):
# listwise input sequence
fill_default_train = functools.partial(
paddle.dataset.mq2007.train, format="listwise")
fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise")
train_reader = paddle.batch(
paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
test_reader = paddle.batch(fill_default_test, batch_size=32)
# mq2007 input_dim = 46, dense format
input_dim = 46
cost, output = lambda_rank(input_dim)
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
# Define end batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id),
"w") as f:
parameters.to_tar(f)
feeding = {"label": 0, "data": 1}
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_passes)
def lambda_rank_infer(pass_id):
"""
lambda_rank model inference interface
parameters:
pass_id : inference model in pass_id
"""
print "Begin to Infer..."
input_dim = 46
    # lambda_rank returns (cost, output); only the output layer is needed for inference
    cost, output = lambda_rank(input_dim)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1)))
infer_query_id = None
infer_data = []
infer_data_num = 1
fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise")
for label, querylist in fill_default_test():
infer_data.append(querylist)
if len(infer_data) == infer_data_num:
break
    # predict a score for each document in infer_data, then sort the documents
    # by score in descending order to obtain the final ranking
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print i, score
if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=1)
train_lambda_rank(2)
lambda_rank_infer(pass_id=1)
import numpy as np
import unittest
def ndcg(score_list):
"""
measure the ndcg score of order list
https://en.wikipedia.org/wiki/Discounted_cumulative_gain
parameter:
score_list: np.array, shape=(sample_num,1)
e.g. predict rank score list :
>>> scores = [3, 2, 3, 0, 1, 2]
>>> ndcg_score = ndcg(scores)
"""
def dcg(score_list):
n = len(score_list)
cost = .0
for i in range(n):
cost += float(score_list[i]) / np.log((i + 1) + 1)
return cost
dcg_cost = dcg(score_list)
score_ranking = sorted(score_list, reverse=True)
ideal_cost = dcg(score_ranking)
return dcg_cost / ideal_cost
class NdcgTest(unittest.TestCase):
    def test_ndcg(self):
        # the method name must start with "test" so that unittest discovers and runs it
        a = [3, 2, 3, 0, 1, 2]
        value = ndcg(a)
        self.assertAlmostEqual(0.961, value, places=3)
if __name__ == '__main__':
unittest.main()
import os
import sys
import gzip
import functools
import paddle.v2 as paddle
import numpy as np
from metrics import ndcg
# ranknet is the classic pairwise learning to rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf
def half_ranknet(name_prefix, input_dim):
"""
    Parameters with the same name are shared in the PaddlePaddle framework.
    RankNet relies on this to let the left and right half networks share
    weights; for details see
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
"""
# data layer
data = paddle.layer.data(name_prefix + "/data",
paddle.data_type.dense_vector(input_dim))
# hidden layer
hd1 = paddle.layer.fc(
input=data,
size=10,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
# fully connect layer/ output layer
output = paddle.layer.fc(
input=hd1,
size=1,
act=paddle.activation.Linear(),
param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
return output
def ranknet(input_dim):
# label layer
label = paddle.layer.data("label", paddle.data_type.dense_vector(1))
# reuse the parameter in half_ranknet
output_left = half_ranknet("left", input_dim)
output_right = half_ranknet("right", input_dim)
evaluator = paddle.evaluator.auc(input=output_left, label=label)
# rankcost layer
cost = paddle.layer.rank_cost(
name="cost", left=output_left, right=output_right, label=label)
return cost
def train_ranknet(num_passes):
train_reader = paddle.batch(
paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
batch_size=100)
test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)
    # mq2007 feature_dim = 46, dense format
    # the hidden fc layer in half_ranknet has size 10
feature_dim = 46
cost = ranknet(feature_dim)
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=2e-4))
# Define the input data order
feeding = {"label": 0, "left/data": 1, "right/data": 2}
# Define end batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d Batch %d Cost %.9f" % (
event.pass_id, event.batch_id, event.cost)
else:
sys.stdout.write(".")
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
"w") as f:
parameters.to_tar(f)
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_passes)
def ranknet_infer(pass_id):
"""
load the trained model. And predict with plain txt input
"""
print "Begin to Infer..."
feature_dim = 46
    # only half_ranknet is needed to predict a rank score, which is then used to sort documents
output = half_ranknet("left", feature_dim)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)))
# load data of same query and relevance documents, need ranknet to rank these candidates
infer_query_id = []
infer_data = []
infer_doc_index = []
# convert to mq2007 built-in data format
# <query_id> <relevance_score> <feature_vector>
plain_txt_test = functools.partial(
paddle.dataset.mq2007.test, format="plain_txt")
for query_id, relevance_score, feature_vector in plain_txt_test():
infer_query_id.append(query_id)
infer_data.append(feature_vector)
    # predict a score for each document in infer_data, then sort the documents
    # by score in descending order to obtain the final ranking
scores = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data)
for query_id, score in zip(infer_query_id, scores):
print "query_id : ", query_id, " ranknet rank document order : ", score
if __name__ == '__main__':
paddle.init(use_gpu=False, trainer_count=4)
pass_num = 2
train_ranknet(pass_num)
ranknet_infer(pass_id=pass_num - 1)
*.pyc
*.tar.gz
models
TBD
# Accelerating Word Embedding Training with Noise-Contrastive Estimation
Word embeddings are the foundation of many natural language processing tasks; see the [word embedding](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) chapter of PaddleBook for a detailed introduction. A popular way to obtain word embeddings is to train a Neural Probabilistic Language Model (NPLM). However, the last layer of an NPLM has to compute a probability distribution over the whole vocabulary, and the larger the vocabulary, the more expensive this layer becomes, often dominating training time. Another article in models already introduced [speeding up word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid); here we introduce an alternative: the Noise-Contrastive Estimation (NCE) loss function \[[1](#references)\].
## NCE
The final `softmax` layer of an NPLM has to evaluate the exponential term for every class, i.e. every word in the vocabulary, and on typical corpora the vocabulary is very large \[[3](#references)\], which makes the whole training process slow. NCE is a fast method for estimating discrete distributions. Compared with the commonly used hierarchical-sigmoid method \[[2](#references)\], NCE no longer needs a complex binary tree to build the objective function; instead it uses relatively simple random negative sampling, which greatly improves computational efficiency.
Suppose the context $h$ is given and the data distribution under this context is $P^h(w)$; samples drawn from it serve as positive examples, while samples drawn from a noise distribution $P_n(w)$ serve as negative examples. Any suitable noise distribution can be chosen; by default it is the unbiased uniform distribution. Assuming there are $k$ noise samples for every data sample, the probability that a sample comes from the data is \[[1](#references)\]:
$$P^h(D=1|w,\theta)=\frac { P_\theta^h(w) }{ P^h_\theta(w)+kP_n(w) } =\sigma (\Delta s_\theta(w,h))$$
where $\Delta s_\theta(w,h)=s_\theta(w,h)-\log (kP_n(w))$ and $s_\theta(w,h)$ is the score of generating word $w$ in context $h$. The overall objective is to raise the probability of positive samples while lowering that of negative samples. The objective function is [[1](#references)]:
$$
J^h(\theta )=E_{ P_d^h }\left[ \log { P^h(D=1|w,\theta ) } \right] +kE_{ P_n }\left[ \log P^h (D=0|w,\theta ) \right]$$
$$
\\\\\qquad =E_{ P_d^h }\left[ \log { \sigma (\Delta s_\theta(w,h)) } \right] +kE_{ P_n }\left[ \log (1-\sigma (\Delta s_\theta(w,h))) \right]$$
In short, NCE builds a logistic regression that performs binary classification between positive and negative examples: for each sample, the true next-word label is the positive example and $k$ other word labels are sampled as negatives, so only the probabilities over these $k+1$ labels need to be computed. Compared with the original `softmax` classification, which computes a score for every class and then normalizes to obtain probabilities, this saves a large amount of computation.
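To make the objective concrete, here is a minimal NumPy sketch (not part of this example's code) that evaluates the NCE loss for one context, with a uniform noise distribution and made-up scores, vocabulary size and $k$:
```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

k = 3                                  # number of noise samples per data sample
vocab_size = 10000                     # assumed vocabulary size
p_n = 1.0 / vocab_size                 # uniform noise distribution P_n(w)

s_true = 2.0                           # hypothetical score s_theta(w, h) of the true next word
s_noise = np.array([0.1, -0.5, 0.3])   # hypothetical scores of the k sampled noise words

# Delta s_theta(w, h) = s_theta(w, h) - log(k * P_n(w))
delta_true = s_true - np.log(k * p_n)
delta_noise = s_noise - np.log(k * p_n)

# the expectation over noise in J^h(theta) is approximated by the sum over the k draws:
# loss = -( log sigma(Delta s_true) + sum_j log(1 - sigma(Delta s_noise_j)) )
nce_loss = -(np.log(sigmoid(delta_true)) + np.sum(np.log(1.0 - sigmoid(delta_noise))))
print(nce_loss)
```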
## Experimental Data
This example trains the language model on the Penn Treebank (PTB) dataset ([Tomas Mikolov's preprocessed version](http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz)). PaddlePaddle provides the [paddle.dataset.imikolov](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/imikolov.py) interface for loading these data; if the data are not found locally, they are downloaded automatically and their integrity is verified. The interface also preprocesses the data with a sliding window of size 5, which simplifies later processing. The corpus is in English and contains 42068 training sentences and 3761 test sentences.
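As a quick sanity check, the snippet below (a minimal sketch, not required for training) prints the first few 5-word windows produced by the built-in reader; each sample is a tuple of 5 word ids, where the first 4 are the context and the last is the target word:
```python
import paddle.v2 as paddle

word_dict = paddle.dataset.imikolov.build_dict()
print("dictionary size: %d" % len(word_dict))

# each sample is a 5-tuple of word ids: 4 context words followed by the target word
for i, sample in enumerate(paddle.dataset.imikolov.train(word_dict, 5)()):
    print(sample)
    if i >= 2:
        break
```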
## Network Architecture
The detailed structure of the N-gram neural probabilistic language model is shown in Figure 1:
<p align="center">
<img src="images/network_conf.png" width = "70%" align="center"/><br/>
Figure 1. Network configuration
</p>
As shown, the model consists of the following parts:
1. **Input layer**: each PTB sample consists of raw English words; every word is mapped to its id in the dictionary, and this unique id distinguishes each word.
2. **Embedding layer**: compared with the raw id representation, word embeddings better capture the semantic relationships between words. A learnable embedding matrix maps each id to a fixed-dimensional word vector. After training, semantic similarity between words can be measured by the distance between their embeddings: the more similar the meaning, the closer the vectors.
3. **Concatenation layer**: the word vectors are concatenated end to end into one long vector, which is convenient for the following fully connected layer.
4. **Fully connected hidden layer**: the long vector from the previous layer is fed into a single hidden layer that outputs a feature vector. The fully connected hidden layer strengthens the network's capacity to learn.
5. **NCE layer**: during training, the NCE Layer provided by PaddlePaddle can be used directly.
## Training
Run `python train.py` in a terminal to start training.
- On the first run, the program checks whether the PTB dataset is present in the user's cache directory and downloads it automatically if not.
- During training, the cost on the training set is printed every 10 batches.
- At the end of each pass, the loss on the test set is computed and the latest model snapshot is saved.
The NCE call in the model file `network_conf.py` is as follows:
```python
cost = paddle.layer.nce(
input=hidden_layer,
label=next_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"),
act=paddle.activation.Sigmoid(),
num_neg_samples=25,
neg_distribution=None)
```
Some important parameters of the NCE layer are explained below:
| Parameter | Role | Notes |
|:------ |:-------| :--------|
| param\_attr / bias\_attr | Set the parameter names | Makes it easy to load the parameters at prediction time; see the Prediction section below. |
| num\_neg\_samples | Number of negative samples | Controls the ratio of positive to negative samples. Valid values lie in [1, dictionary size - 1]; more negative samples slow training down but improve model accuracy. |
| neg\_distribution | Distribution used to draw negative labels; uniform by default | Lets you control the per-class sampling weights for negative samples. For example, if the positive example is "sunny" and the negative example "flood" should be distinguished more carefully during training, the sampling weight of the "flood" class can be increased; a sketch is given below. |
| act | Activation function | According to the NCE formulation, the sigmoid activation should be used here. |
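The sketch below is hypothetical and not part of this example's code: it passes a hand-crafted sampling distribution instead of the default uniform one, assuming `neg_distribution` accepts a list of per-class weights of length `dict_size` (normalization to sum to 1 is also an assumption). The word id and weights are made up for illustration:
```python
# start from a uniform distribution and up-weight one class that the model
# should separate more carefully (e.g. the hypothetical id of the word "flood")
flood_id = 1234                       # hypothetical word id
neg_dist = [1.0] * dict_size
neg_dist[flood_id] = 5.0              # sample this class 5x more often as a negative
total = sum(neg_dist)
neg_dist = [w / total for w in neg_dist]

cost = paddle.layer.nce(
    input=hidden_layer,
    label=next_word,
    num_classes=dict_size,
    param_attr=paddle.attr.Param(name="nce_w"),
    bias_attr=paddle.attr.Param(name="nce_b"),
    act=paddle.activation.Sigmoid(),
    num_neg_samples=25,
    neg_distribution=neg_dist)
```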
## Prediction
1. First edit the `main` function of the `infer.py` script to specify the model to test.
2. Note that **the computation at prediction time differs from training**: the `paddle.layer.nce` layer used in training must be replaced with a fully connected layer `paddle.layer.fc`, which directly loads the parameters learned by NCE. The code is as follows:
```python
prediction = paddle.layer.fc(
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b"),
input=hidden_layer,
param_attr=paddle.attr.Param(name="nce_w"))
```
3. Run `python infer.py`. The program first loads the specified model, then predicts batch by batch and prints the results. The output format is as follows:
```text
0.6734 their may want to move
```
Each line is one prediction, with three tab-separated columns:
- Column 1: the probability of the next word.
- Column 2: the next word predicted by the model.
- Column 3: the $n$ input words, separated by spaces.
## References
1. Mnih A, Kavukcuoglu K. [Learning word embeddings efficiently with noise-contrastive estimation](https://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf)[C]//Advances in neural information processing systems. 2013: 2265-2273.
2. Morin, F., & Bengio, Y. (2005, January). [Hierarchical Probabilistic Neural Network Language Model](http://www.iro.umontreal.ca/~lisa/pointeurs/hierarchical-nnlm-aistats05.pdf). In Aistats (Vol. 5, pp. 246-252).
3. Mnih A, Teh Y W. [A Fast and Simple Algorithm for Training Neural Probabilistic Language Models](http://xueshu.baidu.com/s?wd=paperuri%3A%280735b97df93976efb333ac8c266a1eb2%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Farxiv.org%2Fabs%2F1206.6426&ie=utf-8&sc_us=5770715420073315630)[J]. Computer Science, 2012:1751-1758.
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import os
import gzip
import numpy as np
import paddle.v2 as paddle
from network_conf import ngram_lm
def infer_a_batch(inferer, test_batch, id_to_word):
probs = inferer.infer(input=test_batch)
for i, res in enumerate(zip(test_batch, probs)):
maxid = res[1].argsort()[-1]
print("%.4f\t%s\t%s" % (res[1][maxid], id_to_word[maxid],
" ".join([id_to_word[w] for w in res[0]])))
def infer(model_path, batch_size):
assert os.path.exists(model_path), "the trained model does not exist."
word_to_id = paddle.dataset.imikolov.build_dict()
id_to_word = dict((v, k) for k, v in word_to_id.items())
dict_size = len(word_to_id)
paddle.init(use_gpu=False, trainer_count=1)
# load the trained model.
with gzip.open(model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
prediction_layer = ngram_lm(
is_train=False, hidden_size=128, emb_size=512, dict_size=dict_size)
inferer = paddle.inference.Inference(
output_layer=prediction_layer, parameters=parameters)
test_batch = []
for idx, item in enumerate(paddle.dataset.imikolov.test(word_to_id, 5)()):
test_batch.append((item[:4]))
        if len(test_batch) == batch_size:
            infer_a_batch(inferer, test_batch, id_to_word)
            # reset the batch buffer after inference, otherwise it keeps growing
            test_batch = []
    if len(test_batch):
        infer_a_batch(inferer, test_batch, id_to_word)
        test_batch = []
if __name__ == "__main__":
infer("models/model_pass_00000_00020.tar.gz", 10)
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import math
import paddle.v2 as paddle
def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
emb_layers = []
embed_param_attr = paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
for i in range(gram_num):
word = paddle.layer.data(
name="__word%02d__" % (i),
type=paddle.data_type.integer_value(dict_size))
emb_layers.append(
paddle.layer.embedding(
input=word, size=emb_size, param_attr=embed_param_attr))
next_word = paddle.layer.data(
name="__target_word__", type=paddle.data_type.integer_value(dict_size))
context_embedding = paddle.layer.concat(input=emb_layers)
hidden_layer = paddle.layer.fc(
input=context_embedding,
size=hidden_size,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=1. / math.sqrt(emb_size * 8)))
if is_train:
cost = paddle.layer.nce(
input=hidden_layer,
label=next_word,
num_classes=dict_size,
param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"),
act=paddle.activation.Sigmoid(),
num_neg_samples=25,
neg_distribution=None)
return cost
else:
prediction = paddle.layer.fc(
size=dict_size,
act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b"),
input=hidden_layer,
param_attr=paddle.attr.Param(name="nce_w"))
return prediction
#!/usr/bin/env python
# -*- encoding:utf-8 -*-
import os
import logging
import gzip
import paddle.v2 as paddle
from network_conf import ngram_lm
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def train(model_save_dir):
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
cost = ngram_lm(hidden_size=128, emb_size=512, dict_size=dict_size)
parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(cost, parameters, optimizer)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f" %
(event.pass_id, event.batch_id, event.cost))
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
paddle.batch(paddle.dataset.imikolov.test(word_dict, 5), 64))
logger.info("Test Pass %d, Cost %f" % (event.pass_id, result.cost))
save_path = os.path.join(model_save_dir,
"model_pass_%05d.tar.gz" % event.pass_id)
logger.info("Save model into %s ..." % save_path)
with gzip.open(save_path, "w") as f:
parameters.to_tar(f)
trainer.train(
paddle.batch(paddle.dataset.imikolov.train(word_dict, 5), 64),
num_passes=1000,
event_handler=event_handler)
if __name__ == "__main__":
train(model_save_dir="models")
# Neural Machine Translation Model
## Background
Machine translation uses computers to render a source-language text into an equivalent expression in a target language. It is an important research direction in natural language processing with broad practical demand, and its methods have evolved continually. Traditional approaches are mainly rule-based or statistical: translation rules or linguistic features must be specified by hand, and the quality depends on how well people understand the source and target languages. In recent years, the rise of deep learning has made automatic feature learning possible. Deep learning first succeeded in image and speech recognition and then sparked a wave of research in natural language processing tasks such as machine translation. Deep learning models for machine translation learn the mapping from source to target language directly, greatly reducing human involvement while markedly improving translation quality. This example shows how to build an end-to-end Neural Machine Translation (NMT) model with a Recurrent Neural Network (RNN) in PaddlePaddle.
## Model Overview
An RNN-based NMT model follows the encoder-decoder structure, where both the encoder and the decoder are recurrent neural networks. Unrolling the two RNNs along the time axis gives the model structure below:
<p align="center"><img src="images/encoder-decoder.png" width = "90%" align="center"/><br/>Figure 1. Encoder-decoder framework </p>
The inputs and outputs of an NMT model can be characters, words, or phrases. Without loss of generality, this example uses a word-based model to explain how the encoder and decoder work:
- **Encoder**: encodes the source sentence into a vector that is fed to the decoder. The raw input is the `id` sequence of the words, $w = {w_1, w_2, ..., w_T}$, in one-hot representation. To reduce the input dimensionality and capture semantic relations between words, the model learns a word embedding for each one-hot encoded word; see the [word embedding](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md) chapter of PaddleBook for details. Finally, the RNN unit processes the input word by word to produce an encoding vector of the whole sentence.
- **Decoder**: takes the encoder output and decodes the target-language sequence $u = {u_1, u_2, ..., u_{T'}}$ word by word. At each time step the RNN unit outputs a hidden vector, which after `Softmax` normalization gives the conditional probability of the next target word, $P(u_i | w, u_1, u_2, ..., u_{i-1})$. Given the input $w$, the probability that the translation is $u$ is therefore
$$ P(u_1,u_2,...,u_{T'} | w) = \prod_{t=1}^{t={T'}}p(u_t|w, u_1, u_2, ..., u_{t-1})$$
Taking Chinese-to-English translation as an example, the source language is Chinese and the target language is English. Below is a segmented source sentence:
```
祝愿 祖国 繁荣 昌盛
```
and its English translation in the target language:
```
Wish motherland rich and powerful
```
In the preprocessing stage, parallel corpora of mutual translations between the source and target language are prepared, and dictionaries are built for each language. In the training stage, the model is trained on these sentence pairs. At test time, a Chinese sentence is fed in, the model generates the corresponding English translation, and the result is evaluated against a reference translation. In machine translation, BLEU is one of the most popular automatic evaluation metrics.
### RNN Unit
The original RNN stores the hidden state in a single vector, but such RNNs suffer from vanishing gradients during training and have difficulty modeling long-range dependencies. Improved RNN units were therefore proposed, namely LSTM \[[1](#references)] and GRU \[[2](#references)], which use gates to control what should be remembered and what should be forgotten, largely solving the long-range dependency problem for sequence data. Taking the GRU used in this example, its basic structure is as follows:
<p align="center">
<img src="images/gru.png" width = "90%" align="center"/><br/>
Figure 2. GRU unit
</p>
Besides the hidden state, a GRU contains two gates: an update gate and a reset gate. At every time step, the gates and the hidden state are updated according to the equations shown on the right of Figure 2 (restated below); the two gates determine how the state is updated.
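For reference, these are the standard GRU updates in the convention of \[[2](#references)], with bias terms omitted; they restate the figure and add no extra structure to this example:
$$ z_t = \sigma(W_z x_t + U_z h_{t-1}) $$
$$ r_t = \sigma(W_r x_t + U_r h_{t-1}) $$
$$ \tilde{h}_t = \tanh(W x_t + U(r_t \odot h_{t-1})) $$
$$ h_t = z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t $$
where $z_t$ is the update gate, $r_t$ the reset gate, $\tilde{h}_t$ the candidate state, and $\odot$ denotes element-wise multiplication.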
### Bidirectional Encoder
In the basic model above, when the encoder processes the input sentence sequentially, the state at the current time step only contains information about past inputs, not about future parts of the sequence. For sequence modeling, however, future context also carries important information. The bidirectional encoder shown in Figure 3 can be used to capture the context on both sides of the current input:
<p align="center">
<img src="images/bidirectional-encoder.png" width = "90%" align="center"/><br/>
Figure 3. Bidirectional encoder structure
</p>
The bidirectional encoder shown in Figure 3 \[[3](#references)\] consists of two independent RNNs that encode the input sequence in the forward and backward directions respectively; the outputs of the two RNNs are then merged into the final encoding.
In PaddlePaddle, a bidirectional encoder can be built conveniently with the relevant APIs:
```python
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# source embedding
src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim)
# bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding,
size=encoder_size,
fwd_act=paddle.activation.Tanh(),
fwd_gate_act=paddle.activation.Sigmoid(),
bwd_act=paddle.activation.Tanh(),
bwd_gate_act=paddle.activation.Sigmoid(),
return_seq=True)
```
### Beam Search Algorithm
In the generation stage after training, the model decodes the source-language input into the corresponding target-language translation. A straightforward decoding strategy is to take, at each step, the word with the highest conditional probability as the output of that time step. But local optima do not necessarily lead to a global optimum: this greedy approach does not guarantee that the complete sentence obtained at the end has the highest probability. Searching the full space of solutions, on the other hand, is too expensive. Beam search is the usual compromise: a heuristic graph-search algorithm that controls the search width with a parameter $k$. Its main steps are:
**1**. During decoding, always maintain $k$ partially decoded subsequences;
**2**. At an intermediate step $t$, for each of the $k$ subsequences, compute the probabilities of the next word and take the $k$ most likely words, giving $k^2$ new candidate subsequences;
**3**. Keep the $k$ most probable of these candidates and use them to replace the original subsequences;
**4**. Iterate until $k$ complete sentences have been obtained, which serve as translation candidates.
For more on beam search, see the [beam search](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md#柱搜索算法) section of the [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of PaddleBook; a minimal sketch of the procedure is also given below.
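The sketch below is framework-independent and only illustrates the steps above; in this example the search itself is carried out inside the network by `paddle.layer.beam_search`. Here `next_word_log_probs` is a hypothetical callback that returns `(word_id, log_prob)` pairs for the next position, assumed to be sorted by decreasing log probability:
```python
import heapq

def beam_search(next_word_log_probs, bos_id, eos_id, beam_size, max_length):
    beams = [([bos_id], 0.0)]          # k partial sequences and their log probabilities
    finished = []
    for _ in range(max_length):
        candidates = []
        for prefix, score in beams:
            # expand each of the k subsequences with its k best next words
            for word_id, logp in next_word_log_probs(prefix)[:beam_size]:
                candidates.append((prefix + [word_id], score + logp))
        # keep only the k most probable of the (up to) k*k expansions
        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[1])
        finished += [b for b in beams if b[0][-1] == eos_id]
        beams = [b for b in beams if b[0][-1] != eos_id]
        if len(finished) >= beam_size or not beams:
            break
    return heapq.nlargest(beam_size, finished + beams, key=lambda c: c[1])
```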
### Decoder without Attention
- The [machine translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of PaddleBook already covers the encoder-decoder structure with an attention mechanism; this example uses the encoder-decoder structure without attention. For more on attention, readers can refer to PaddleBook and reference \[[3](#references)].
PaddlePaddle ships good implementations of the popular RNN units, which can be called directly. If some custom operation is needed at every time step of the RNN, the `recurrent_layer_group` mechanism in PaddlePaddle can be used: first define a single-step function, then use `recurrent_group()` to call that step function repeatedly over the whole sequence. The decoder without attention in this example is implemented with `recurrent_layer_group`; the single-step function `gru_decoder_without_attention()` is shown below:
```python
# the initialization state for decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.fc(
size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
# the step function for decoder GRU
def gru_decoder_without_attention(enc_vec, current_word):
'''
Step function for gru decoder
:param enc_vec: encoded vector of source language
:type enc_vec: layer object
:param current_word: current input of decoder
:type current_word: layer object
'''
decoder_mem = paddle.layer.memory(
name="gru_decoder",
size=decoder_size,
boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, input=[context, current_word])
gru_step = paddle.layer.gru_step(
name="gru_decoder",
act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
out = paddle.layer.fc(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=gru_step)
return out
```
The decoder behaves quite differently during training and during testing:
- **Training**: the word embeddings of the target translation, `trg_embedding`, are passed to the step function `gru_decoder_without_attention()`; `recurrent_group()` calls the step function repeatedly, and finally the cost between the target translation and the actual decoding is computed and returned;
- **Testing**: the decoder predicts the next word from the last generated word; `GeneratedInput()` automatically fetches the embeddings of the $k$ most probable words predicted by the model and passes them to the step function, and `beam_search()` calls the step function `gru_decoder_without_attention()` to perform the beam search and return the result.
The training and generation logic is implemented in the following `if-else` branches:
```python
group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_inputs = [group_input1]
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name="_target_language_embedding",
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
```
## Data Preparation
The data used in this example come from [WMT14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), a parallel corpus of French-English translations. The [bitexts](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) are used as training data and the [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) as validation and test data. PaddlePaddle already wraps a reader interface for this dataset; on the first run the program downloads it automatically, so no manual data preparation is required.
## Model Training and Testing
### Model Training
Starting training is simple: just run `python train.py` in a terminal. The `train()` function in the `train.py` script performs the following steps in order:
**a) Parse the network definition and initialize the model parameters**
```python
# define the network topolgy.
cost = seq2seq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
```
**b) Set the optimization strategy and define the training data `reader`**
```python
# define optimization method
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# define the trainer instance
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
batch_size=55)
```
**c) Define the event handler to print intermediate training results and save model snapshots**
```python
# define the event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if not event.batch_id % 100 and event.batch_id:
                with gzip.open(
                        os.path.join(save_path,
                                     "nmt_without_att_%05d_batch_%05d.tar.gz" %
                                     (event.pass_id, event.batch_id)), "w") as f:
parameters.to_tar(f)
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
```
**d) Start training**
```python
# start training
trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2)
```
Sample output:
```text
Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
.........
Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
.........
Pass 0, Batch 20, Cost 177.989329, {'classification_error_evaluator': 0.9052488207817078}
.........
Pass 0, Batch 30, Cost 153.633665, {'classification_error_evaluator': 0.8643803596496582}
.........
Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973}
```
### Generating Translations
Generating translations with a trained model is also straightforward.
1. First edit the arguments passed to the `generate` function in the `main` part of the `generate.py` script to choose which saved model to use for generation. The default arguments are:
```python
generate(
source_dict_dim=30000,
target_dict_dim=30000,
batch_size=20,
beam_size=3,
model_path="models/nmt_without_att_params_batch_00100.tar.gz")
```
2. Run `python generate.py` in a terminal; the `generate()` function in the script performs the following steps in order:
**a) Load the test samples**
```python
# load data samples for generation
gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
gen_data = []
for item in gen_creator():
gen_data.append((item[0], ))
```
**b) Initialize the model and call `infer()` to generate `beam search` translations for every input sample**
```python
beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
with gzip.open(init_models_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
```
**c) Load the source and target dictionaries, convert the `id` sequences back into words and print the results**
```python
beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
gen_sen_idx = np.where(beam_result[1] == -1)[0]
assert len(gen_sen_idx) == len(test_batch) * beam_size
start_pos, end_pos = 1, 0
for i, sample in enumerate(test_batch):
print(" ".join([
src_dict[w] for w in sample[0][1:-1]
])) # skip the start and ending mark when print the source sentence
for j in xrange(beam_size):
end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
```
With a beam search width of 3 and a French sentence as input, translations are generated automatically for the test data. The output format is:
```text
Elles connaissent leur entreprise mieux que personne .
-3.754819 They know their business better than anyone . <e>
-4.445528 They know their businesses better than anyone . <e>
-5.026885 They know their business better than anybody . <e>
```
- The first line is the input source-language sentence.
- Lines 2 to beam_size + 1 are the `beam_size` translations generated by beam search.
- Each of these lines has two columns separated by "\t": the first column is the log probability of the sentence, the second is the text of the translation.
- The symbol `<s>` marks the beginning of a sentence and `<e>` its end; words not contained in the dictionary are replaced by `<unk>`.
With this we have implemented a basic machine translation model on PaddlePaddle. As we have seen, PaddlePaddle provides a flexible and rich set of APIs, which makes it easy to configure all kinds of complex networks. Machine translation itself is a fast-moving field in which new methods and ideas keep emerging; interested readers can build more sophisticated and better-performing translation models on top of PaddlePaddle after working through this example.
## References
[1] Sutskever I, Vinyals O, Le Q V. [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)[J]. 2014, 4:3104-3112.
[2] Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
[3] Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]. Proceedings of ICLR 2015, 2015
#!/usr/bin/env python
import os
import gzip
import logging
import numpy as np
import paddle.v2 as paddle
from network_conf import seq2seq_net
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict):
beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
gen_sen_idx = np.where(beam_result[1] == -1)[0]
assert len(gen_sen_idx) == len(test_batch) * beam_size
start_pos, end_pos = 1, 0
for i, sample in enumerate(test_batch):
print(" ".join([
src_dict[w] for w in sample[0][1:-1]
])) # skip the start and ending mark when print the source sentence
for j in xrange(beam_size):
end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
def generate(source_dict_dim, target_dict_dim, model_path, beam_size,
batch_size):
"""
Sequence generation for NMT.
:param source_dict_dim: size of source dictionary
:type source_dict_dim: int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
    :param model_path: path for the initial model
    :type model_path: string
    :param beam_size: the expansion width in each generation step
    :type beam_size: int
    :param batch_size: the number of training examples in one forward pass
    :type batch_size: int
"""
assert os.path.exists(model_path), "trained model does not exist."
# step 1: prepare dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
# step 2: load the trained model
paddle.init(use_gpu=False, trainer_count=1)
with gzip.open(model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
beam_gen = seq2seq_net(
source_dict_dim,
target_dict_dim,
beam_size=beam_size,
max_length=100,
is_generating=True)
inferer = paddle.inference.Inference(
output_layer=beam_gen, parameters=parameters)
# step 3: iterating over the testing dataset
test_batch = []
for idx, item in enumerate(paddle.dataset.wmt14.gen(source_dict_dim)()):
test_batch.append([item[0]])
if len(test_batch) == batch_size:
infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict)
test_batch = []
if len(test_batch):
infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict)
test_batch = []
if __name__ == "__main__":
generate(
source_dict_dim=30000,
target_dict_dim=30000,
batch_size=20,
beam_size=3,
model_path="models/nmt_without_att_params_batch_00100.tar.gz")
<html>
<head>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
box-sizing: border-box;
min-width: 200px;
max-width: 980px;
margin: 0 auto;
padding: 45px;
}
</style>
<body>
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
# 神经网络机器翻译模型
## 背景介绍
机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。
## 模型概览
基于 RNN 的神经网络机器翻译模型遵循编码器-解码器结构,其中的编码器和解码器均是一个循环神经网络。将构成编码器和解码器的两个 RNN 沿时间步展开,得到如下的模型结构图:
<p align="center"><img src="images/encoder-decoder.png" width = "90%" align="center"/><br/>图 1. 编码器-解码器框架 </p>
神经机器翻译模型的输入输出可以是字符,也可以是词或者短语。不失一般性,本例以基于词的模型为例说明编码器/解码器的工作机制:
- **编码器**:将源语言句子编码成一个向量,作为解码器的输入。解码器的原始输入是表示词的 `id` 序列 $w = {w_1, w_2, ..., w_T}$,用独热(One-hot)码表示。为了对输入进行降维,同时建立词语之间的语义关联,模型为热独码表示的单词学习一个词嵌入(Word Embedding)表示,也就是常说的词向量,关于词向量的详细介绍请参考 PaddleBook 的[词向量](https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/README.cn.md)一章。最后 RNN 单元逐个词地处理输入,得到完整句子的编码向量。
- **解码器**:接受编码器的输入,逐个词地解码出目标语言序列 $u = {u_1, u_2, ..., u_{T'}}$。每个时间步,RNN 单元输出一个隐藏向量,之后经 `Softmax` 归一化计算出下一个目标词的条件概率,即 $P(u_i | w, u_1, u_2, ..., u_{t-1})$。因此,给定输入 $w$,其对应的翻译结果为 $u$ 的概率则为
$$ P(u_1,u_2,...,u_{T'} | w) = \prod_{t=1}^{t={T'}}p(u_t|w, u_1, u_2, u_{t-1})$$
以中文到英文的翻译为例,源语言是中文,目标语言是英文。下面是一句源语言分词后的句子
```
祝愿 祖国 繁荣 昌盛
```
对应的目标语言英文翻译结果为:
```
Wish motherland rich and powerful
```
在预处理阶段,准备源语言与目标语言互译的平行语料数据,并分别构建源语言和目标语言的词典;在训练阶段,用这样成对的平行语料训练模型;在模型测试阶段,输入中文句子,模型自动生成对应的英语翻译,然后将生成结果与标准翻译对比进行评估。在机器翻译领域,BLEU 是最流行的自动评估指标之一。
### RNN 单元
RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN 在训练时容易发生梯度弥散(gradient vanishing),对于长时间的依赖关系难以建模。因此人们对 RNN 单元进行了改进,提出了 LSTM\[[1](#参考文献)] 和 GRU\[[2](#参考文献)],这两种单元以门来控制应该记住的和遗忘的信息,较好地解决了序列数据的长时依赖问题。以本例所用的 GRU 为例,其基本结构如下:
<p align="center">
<img src="images/gru.png" width = "90%" align="center"/><br/>
图 2. GRU 单元
</p>
可以看到除了隐含状态以外,GRU 内部还包含了两个门:更新门(Update Gate)、重置门(Reset Gate)。在每一个时间步,门限和隐状态的更新由图 2 右侧的公式决定。这两个门限决定了状态以何种方式更新。
### 双向编码器
在上述的基本模型中,编码器在顺序处理输入句子序列时,当前时刻的状态只包含了历史输入信息,而没有未来时刻的序列信息。而对于序列建模,未来时刻的上下文同样包含了重要的信息。可以使用如图 3 所示的这种双向编码器来同时获取当前时刻输入的上下文:
<p align="center">
<img src="images/bidirectional-encoder.png" width = "90%" align="center"/><br/>
图 3. 双向编码器结构示意图
</p>
图 3 所示的双向编码器\[[3](#参考文献)\]由两个独立的 RNN 构成,分别从前向和后向对输入序列进行编码,然后将两个 RNN 的输出合并在一起,作为最终的编码输出。
在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现:
```python
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# source embedding
src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim)
# bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding,
size=encoder_size,
fwd_act=paddle.activation.Tanh(),
fwd_gate_act=paddle.activation.Sigmoid(),
bwd_act=paddle.activation.Tanh(),
bwd_gate_act=paddle.activation.Sigmoid(),
return_seq=True)
```
### 柱搜索(Beam Search) 算法
训练完成后的生成阶段,模型根据源语言输入,解码生成对应的目标语言翻译结果。解码时,一个直接的方式是取每一步条件概率最大的词,作为当前时刻的输出。但局部最优并不一定能得到全局最优,即这种做法并不能保证最后得到的完整句子出现的概率最大。如果对解的全空间进行搜索,其代价又过大。为了解决这个问题,通常采用柱搜索(Beam Search)算法。柱搜索是一种启发式的图搜索算法,用一个参数 $k$ 控制搜索宽度,其要点如下:
**1**. 在解码的过程中,始终维护 $k$ 个已解码出的子序列;
**2**. 在中间时刻 $t$, 对于 $k$ 个子序列中的每个序列,计算下一个词出现的概率并取概率最大的前 $k$ 个词,组合得到 $k^2$ 个新子序列;
**3**. 取 **2** 中这些组合序列中概率最大的前 $k$ 个以更新原来的子序列;
**4**. 不断迭代下去,直至得到 $k$ 个完整的句子,作为翻译结果的候选。
关于柱搜索的更多介绍,可以参考 PaddleBook 中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)一章中[柱搜索](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md#柱搜索算法)一节。
### 无注意力机制的解码器
- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例介绍的则是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:
```python
# the initialization state for decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.fc(
size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
# the step function for decoder GRU
def gru_decoder_without_attention(enc_vec, current_word):
'''
Step function for gru decoder
:param enc_vec: encoded vector of source language
:type enc_vec: layer object
:param current_word: current input of decoder
:type current_word: layer object
'''
decoder_mem = paddle.layer.memory(
name="gru_decoder",
size=decoder_size,
boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, input=[context, current_word])
gru_step = paddle.layer.gru_step(
name="gru_decoder",
act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
out = paddle.layer.fc(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=gru_step)
return out
```
在模型训练和测试阶段,解码器的行为有很大的不同:
- **训练阶段**:目标翻译结果的词向量`trg_embedding`作为参数传递给单步逻辑`gru_decoder_without_attention()`,函数`recurrent_group()`循环调用单步逻辑执行,最后计算目标翻译与实际解码的差异cost并返回;
- **测试阶段**:解码器根据最后一个生成的词预测下一个词,`GeneratedInput()`自动取出模型预测出的概率最高的$k$个词的词向量传递给单步逻辑,`beam_search()`函数调用单步逻辑函数`gru_decoder_without_attention()`完成柱搜索并作为结果返回。
训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python
group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_inputs = [group_input1]
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name="_target_language_embedding",
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
```
## 数据准备
本例所用到的数据来自[WMT14](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/),该数据集是法文到英文互译的平行语料。用[bitexts](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)作为训练数据,[dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)作为验证与测试数据。在PaddlePaddle中已经封装好了该数据集的读取接口,在首次运行的时候,程序会自动完成下载,用户无需手动完成相关的数据准备。
## 模型的训练与测试
### 模型训练
启动模型训练十分简单,只需在命令行窗口中执行`python train.py`。模型训练阶段 `train.py` 脚本中的 `train()` 函数依次完成了如下的逻辑:
**a) 由网络定义,解析网络结构,初始化模型参数**
```python
# define the network topolgy.
cost = seq2seq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
```
**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**
```python
# define optimization method
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# define the trainer instance
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
batch_size=55)
```
**c) 定义事件句柄,打印训练中间结果、保存模型快照**
```python
# define the event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if not event.batch_id % 100 and event.batch_id:
with gzip.open(
                    os.path.join(save_dir_path,
                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
                                 (event.pass_id, event.batch_id)), "w") as f:
parameters.to_tar(f)
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
```
**d) 开始训练**
```python
# start training
trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2)
```
输出样例为
```text
Pass 0, Batch 0, Cost 267.674663, {'classification_error_evaluator': 1.0}
.........
Pass 0, Batch 10, Cost 172.892294, {'classification_error_evaluator': 0.953895092010498}
.........
Pass 0, Batch 20, Cost 177.989329, {'classification_error_evaluator': 0.9052488207817078}
.........
Pass 0, Batch 30, Cost 153.633665, {'classification_error_evaluator': 0.8643803596496582}
.........
Pass 0, Batch 40, Cost 168.170543, {'classification_error_evaluator': 0.8348183631896973}
```
### 生成翻译结果
利用训练好的模型生成翻译文本也十分简单。
1. 首先请修改`generate.py`脚本中`main`函数里传递给`generate`函数的参数,以选择使用哪一个保存的模型来生成。默认参数如下所示:
```python
generate(
source_dict_dim=30000,
target_dict_dim=30000,
batch_size=20,
beam_size=3,
model_path="models/nmt_without_att_params_batch_00100.tar.gz")
```
2. 在终端执行命令 `python generate.py`,脚本中的`generate()`函数依次执行了如下逻辑:
**a) 加载测试样本**
```python
# load data samples for generation
gen_creator = paddle.dataset.wmt14.gen(source_dict_dim)
gen_data = []
for item in gen_creator():
gen_data.append((item[0], ))
```
**b) 初始化模型,执行`infer()`为每个输入样本生成`beam search`的翻译结果**
```python
beam_gen = seq2seq_net(source_dict_dim, target_dict_dim, True)
with gzip.open(init_models_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
```
**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**
```python
beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
gen_sen_idx = np.where(beam_result[1] == -1)[0]
assert len(gen_sen_idx) == len(test_batch) * beam_size
start_pos, end_pos = 1, 0
for i, sample in enumerate(test_batch):
print(" ".join([
src_dict[w] for w in sample[0][1:-1]
])) # skip the start and ending mark when print the source sentence
for j in xrange(beam_size):
end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
```
设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下:
```text
Elles connaissent leur entreprise mieux que personne .
-3.754819 They know their business better than anyone . <e>
-4.445528 They know their businesses better than anyone . <e>
-5.026885 They know their business better than anybody . <e>
```
- 第一行为输入的源语言句子。
- 第二 ~ `beam_size + 1` 行是柱搜索生成的 `beam_size` 条翻译结果。
- 同一行的输出以“\t”分隔为两列,第一列是句子的 log 概率,第二列是翻译结果的文本。
- 符号`<s>` 表示句子的开始,符号`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用符号`<unk>`替代。
至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的 API 供大家选择和使用,使得我们能够很方便地完成各种复杂网络的配置。机器翻译本身也是一个快速发展的领域,各种新方法、新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。
## 参考文献
[1] Sutskever I, Vinyals O, Le Q V. [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215)[J]. 2014, 4:3104-3112.
[2] Cho K, Van Merriënboer B, Gulcehre C, et al. [Learning phrase representations using RNN encoder-decoder for statistical machine translation](http://www.aclweb.org/anthology/D/D14/D14-1179.pdf)[C]. Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), 2014: 1724-1734.
[3] Bahdanau D, Cho K, Bengio Y. [Neural machine translation by jointly learning to align and translate](https://arxiv.org/abs/1409.0473)[C]. Proceedings of ICLR 2015, 2015.
#!/usr/bin/env python
import paddle.v2 as paddle
import sys
import gzip
def seq2seq_net(source_dict_dim,
target_dict_dim,
word_vector_dim=620,
rnn_hidden_size=1000,
beam_size=1,
max_length=50,
is_generating=False):
"""
Define the network structure of NMT, including encoder and decoder.
:param source_dict_dim: size of source dictionary
:type source_dict_dim : int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
:param word_vector_dim: size of source language word embedding
:type word_vector_dim: int
:param rnn_hidden_size: size of hidden state of encoder and decoder RNN
:type rnn_hidden_size: int
:param beam_size: expansion width in each step when generating
:type beam_size: int
:param max_length: max iteration number in generation
:type max_length: int
    :param is_generating: whether to generate sequence or to train
    :type is_generating: bool
"""
decoder_size = encoder_size = rnn_hidden_size
src_word_id = paddle.layer.data(
name="source_language_word",
type=paddle.data_type.integer_value_sequence(source_dict_dim))
src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim)
# use bidirectional_gru as the encoder
encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding,
size=encoder_size,
fwd_act=paddle.activation.Tanh(),
fwd_gate_act=paddle.activation.Sigmoid(),
bwd_act=paddle.activation.Tanh(),
bwd_gate_act=paddle.activation.Sigmoid(),
return_seq=True)
#### Decoder
encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.fc(
size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
# gru step
def gru_decoder_without_attention(enc_vec, current_word):
"""
Step function for gru decoder
:param enc_vec: encoded vector of source language
:type enc_vec: layer object
:param current_word: current input of decoder
:type current_word: layer object
"""
decoder_mem = paddle.layer.memory(
name="gru_decoder",
size=decoder_size,
boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, input=[context, current_word])
gru_step = paddle.layer.gru_step(
name="gru_decoder",
act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
out = paddle.layer.fc(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=gru_step)
return out
group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_inputs = [group_input1]
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name="_target_language_embedding",
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
#!/usr/bin/env python
import os
import gzip
import logging
import paddle.v2 as paddle
from network_conf import seq2seq_net
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def train(save_dir_path, source_dict_dim, target_dict_dim):
'''
Training function for NMT
:param save_dir_path: path of the directory to save the trained models.
:param save_dir_path: str
:param source_dict_dim: size of source dictionary
:type source_dict_dim: int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
'''
if not os.path.exists(save_dir_path):
os.mkdir(save_dir_path)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
cost = seq2seq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimization method and the trainer instance
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
batch_size=8)
# define the event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if not event.batch_id % 100 and event.batch_id:
with gzip.open(
                        os.path.join(save_dir_path,
                                     "nmt_without_att_%05d_batch_%05d.tar.gz" %
                                     (event.pass_id, event.batch_id)), "w") as f:
parameters.to_tar(f)
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
# start training
trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2)
if __name__ == '__main__':
train(save_dir_path="models", source_dict_dim=3000, target_dict_dim=3000)
TBD
# Scheduled Sampling
## 概述
序列生成任务的生成目标是在给定源输入的条件下,最大化目标序列的概率。训练时该模型将目标序列中的真实元素作为解码器每一步的输入,然后最大化下一个元素的概率。生成时上一步解码得到的元素被用作当前的输入,然后生成下一个元素。可见这种情况下训练阶段和生成阶段的解码器输入数据的概率分布并不一致。
Scheduled Sampling \[[1](#参考文献)\]是一种解决训练和生成时输入数据分布不一致的方法。在训练早期该方法主要使用目标序列中的真实元素作为解码器输入,可以将模型从随机初始化的状态快速引导至一个合理的状态。随着训练的进行,该方法会逐渐更多地使用生成的元素作为解码器输入,以解决数据分布不一致的问题。
标准的序列到序列模型中,如果序列前面生成了错误的元素,后面的输入状态将会受到影响,而该误差会随着生成过程不断向后累积。Scheduled Sampling以一定概率将生成的元素作为解码器输入,这样即使前面生成错误,其训练目标仍然是最大化真实目标序列的概率,模型会朝着正确的方向进行训练。因此这种方式增加了模型的容错能力。
## 算法简介
Scheduled Sampling主要应用在序列到序列模型的训练阶段,而生成阶段则不需要使用。
训练阶段解码器在最大化第$t$个元素概率时,标准序列到序列模型使用上一时刻的真实元素$y_{t-1}$作为输入。设上一时刻生成的元素为$g_{t-1}$,Scheduled Sampling算法会以一定概率使用$g_{t-1}$作为解码器输入。
设当前已经训练到了第$i$个mini-batch,Scheduled Sampling定义了一个概率$\epsilon_i$控制解码器的输入。$\epsilon_i$是一个随着$i$增大而衰减的变量,常见的定义方式有:
- 线性衰减:$\epsilon_i=max(\epsilon,k-c*i)$,其中$\epsilon$限制$\epsilon_i$的最小值,$k$和$c$控制线性衰减的幅度。
- 指数衰减:$\epsilon_i=k^i$,其中$0<k<1$,$k$控制着指数衰减的幅度。
- 反向Sigmoid衰减:$\epsilon_i=k/(k+exp(i/k))$,其中$k>1$,$k$同样控制衰减的幅度。
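下面用一小段示意代码直观比较这三种衰减方式(其中 $k$、$c$、$\epsilon$ 均为随意选取的示例参数,并非本例实际使用的取值):
```python
import math
def linear_decay(i, k=1.0, c=1e-4, eps=0.2):
    # 线性衰减:epsilon_i = max(eps, k - c * i)
    return max(eps, k - c * i)
def exponential_decay(i, k=0.999):
    # 指数衰减:epsilon_i = k ** i,要求 0 < k < 1
    return k ** i
def inverse_sigmoid_decay(i, k=500.0):
    # 反向 Sigmoid 衰减:epsilon_i = k / (k + exp(i / k)),要求 k > 1
    return k / (k + math.exp(i / k))
for i in (0, 1000, 5000, 10000):
    print(i, linear_decay(i), exponential_decay(i), inverse_sigmoid_decay(i))
```
可以看到,三种方式都使 $\epsilon_i$ 随 $i$ 的增大而减小,只是下降的快慢不同。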
图1给出了这三种方式的衰减曲线:
<p align="center">
<img src="images/decay.jpg" width="50%" align="center"><br>
图1. 线性衰减、指数衰减和反向Sigmoid衰减的衰减曲线
</p>
如图2所示,在解码器的$t$时刻Scheduled Sampling以概率$\epsilon_i$使用上一时刻的真实元素$y_{t-1}$作为解码器输入,以概率$1-\epsilon_i$使用上一时刻生成的元素$g_{t-1}$作为解码器输入。从图1可知随着$i$的增大$\epsilon_i$会不断减小,解码器将不断倾向于使用生成的元素作为输入,训练阶段和生成阶段的数据分布将变得越来越一致。
<p align="center">
<img src="images/Scheduled_Sampling.jpg" width="50%" align="center"><br>
图2. Scheduled Sampling选择不同元素作为解码器输入示意图
</p>
## 模型实现
由于Scheduled Sampling是对序列到序列模型的改进,其整体实现框架与序列到序列模型较为相似。为突出本文重点,这里仅介绍与Scheduled Sampling相关的部分,完整的代码见`scheduled_sampling.py`。
首先导入需要的包,并定义控制衰减概率的类`RandomScheduleGenerator`,如下:
```python
import numpy as np
import math
class RandomScheduleGenerator:
"""
The random sampling rate for scheduled sampling algoithm, which uses devcayed
sampling rate.
"""
...
```
下面将分别定义类`RandomScheduleGenerator`的`__init__`、`getScheduleRate`和`processBatch`三个方法。
`__init__`方法对类进行初始化,其`schedule_type`参数指定了使用哪种衰减方式,可选的方式有`constant`、`linear`、`exponential`和`inverse_sigmoid`。`constant`指对所有的mini-batch使用固定的$\epsilon_i$,`linear`指线性衰减方式,`exponential`表示指数衰减方式,`inverse_sigmoid`表示反向Sigmoid衰减。`__init__`方法的参数`a`和`b`表示衰减方法的参数,需要在验证集上调优。`self.schedule_computers`将衰减方式映射为计算$\epsilon_i$的函数。最后一行根据`schedule_type`将选择的衰减函数赋给`self.schedule_computer`变量。
```python
def __init__(self, schedule_type, a, b):
"""
schduled_type: is the type of the decay. It supports constant, linear,
exponential, and inverse_sigmoid right now.
a: parameter of the decay (MUST BE DOUBLE)
b: parameter of the decay (MUST BE DOUBLE)
"""
self.schedule_type = schedule_type
self.a = a
self.b = b
self.data_processed_ = 0
self.schedule_computers = {
"constant": lambda a, b, d: a,
"linear": lambda a, b, d: max(a, 1 - d / b),
"exponential": lambda a, b, d: pow(a, d / b),
"inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
}
assert (self.schedule_type in self.schedule_computers)
self.schedule_computer = self.schedule_computers[self.schedule_type]
```
`getScheduleRate`根据衰减函数和已经处理的数据量计算$\epsilon_i$。
```python
def getScheduleRate(self):
"""
Get the schedule sampling rate. Usually not needed to be called by the users
"""
return self.schedule_computer(self.a, self.b, self.data_processed_)
```
`processBatch`方法根据概率值$\epsilon_i$进行采样,得到`indexes`,`indexes`中每个元素取值为`0`的概率为$\epsilon_i$,取值为`1`的概率为$1-\epsilon_i$。`indexes`决定了解码器的输入是真实元素还是生成的元素,取值为`0`表示使用真实元素,取值为`1`表示使用生成的元素。
```python
def processBatch(self, batch_size):
"""
Get a batch_size of sampled indexes. These indexes can be passed to a
MultiplexLayer to select from the grouth truth and generated samples
from the last time step.
"""
rate = self.getScheduleRate()
numbers = np.random.rand(batch_size)
indexes = (numbers >= rate).astype('int32').tolist()
self.data_processed_ += batch_size
return indexes
```
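以下是一个假设性的调用示例,展示上述类的大致用法(其中线性衰减的参数 `0.75`、`1000000` 与后文 reader 封装部分一致,序列长度 8 为示例取值):
```python
schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
# 当前使用真实元素的概率 epsilon_i
print(schedule_generator.getScheduleRate())
# 为一个长度为 8 的目标序列采样:0 表示使用真实元素,1 表示使用生成的元素
print(schedule_generator.processBatch(8))
```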
Scheduled Sampling需要在序列到序列模型的基础上增加一个输入`true_token_flag`,以控制解码器输入。
```python
true_token_flags = paddle.layer.data(
name='true_token_flag',
type=paddle.data_type.integer_value_sequence(2))
```
这里还需要对原始reader进行封装,增加`true_token_flag`的数据生成器。下面以线性衰减为例说明如何调用上面定义的`RandomScheduleGenerator`产生`true_token_flag`的输入数据。
```python
schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
def gen_schedule_data(reader):
"""
Creates a data reader for scheduled sampling.
Output from the iterator that created by original reader will be
appended with "true_token_flag" to indicate whether to use true token.
:param reader: the original reader.
:type reader: callable
:return: the new reader with the field "true_token_flag".
:rtype: callable
"""
def data_reader():
for src_ids, trg_ids, trg_ids_next in reader():
yield src_ids, trg_ids, trg_ids_next, \
[0] + schedule_generator.processBatch(len(trg_ids) - 1)
return data_reader
```
这段代码在原始输入数据(即源序列元素`src_ids`、目标序列元素`trg_ids`和目标序列下一个元素`trg_ids_next`)后追加了控制解码器输入的数据。由于解码器第一个元素是序列开始符,因此将追加的数据第一个元素设置为`0`,表示解码器第一步始终使用真实目标序列的第一个元素(即序列开始符)。
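封装后的 reader 可以像普通 reader 一样与 `paddle.batch` 搭配使用,例如(字典大小、`batch_size` 为示例取值,与本例训练脚本中的用法一致):
```python
wmt14_reader = paddle.batch(
    gen_schedule_data(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(30000), buf_size=8192)),
    batch_size=5)
```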
训练时`recurrent_group`每一步调用的解码器函数如下:
```python
def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
true_token_flag):
"""
The decoder step for training.
:param enc_vec: the encoder vector for attention
:type enc_vec: LayerOutput
:param enc_proj: the encoder projection for attention
:type enc_proj: LayerOutput
:param true_word: the ground-truth target word
:type true_word: LayerOutput
:param true_token_flag: the flag of using the ground-truth target word
:type true_token_flag: LayerOutput
:return: the softmax output layer
:rtype: LayerOutput
"""
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
gru_out_memory = paddle.layer.memory(
name='gru_out', size=target_dict_dim)
generated_word = paddle.layer.max_id(input=gru_out_memory)
generated_word_emb = paddle.layer.embedding(
input=generated_word,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
current_word = paddle.layer.multiplex(
input=[true_token_flag, true_word, generated_word_emb])
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
name='gru_out',
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
```
该函数使用`memory`层`gru_out_memory`记忆上一时刻生成的元素,根据`gru_out_memory`选择概率最大的词语`generated_word`作为生成的词语。`multiplex`层会在真实元素`true_word`和生成的元素`generated_word`之间做出选择,并将选择的结果作为解码器输入。`multiplex`层使用了三个输入,分别为`true_token_flag`、`true_word`和`generated_word_emb`。对于这三个输入中每个元素,若`true_token_flag`中的值为`0`,则`multiplex`层输出`true_word`中的相应元素;若`true_token_flag`中的值为`1`,则`multiplex`层输出`generated_word_emb`中的相应元素。
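`multiplex` 层的选择行为可以用下面这段 numpy 示意代码来理解(仅为说明选择逻辑的草图,并非 PaddlePaddle 层的实际实现):
```python
import numpy as np
true_word_emb = np.array([[1., 1.], [2., 2.], [3., 3.]])       # 真实元素的词向量
generated_word_emb = np.array([[9., 9.], [8., 8.], [7., 7.]])  # 生成元素的词向量
true_token_flag = np.array([0, 1, 0])                          # 0 选真实元素,1 选生成元素
current_word = np.where(
    true_token_flag[:, None] == 0, true_word_emb, generated_word_emb)
print(current_word)  # [[1. 1.] [8. 8.] [3. 3.]]
```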
## 参考文献
[1] Bengio S, Vinyals O, Jaitly N, et al. [Scheduled sampling for sequence prediction with recurrent neural networks](http://papers.nips.cc/paper/5956-scheduled-sampling-for-sequence-prediction-with-recurrent-neural-networks)//Advances in Neural Information Processing Systems. 2015: 1171-1179.
import numpy as np
import math
class RandomScheduleGenerator:
"""
The random sampling rate for scheduled sampling algoithm, which uses devcayed
sampling rate.
"""
def __init__(self, schedule_type, a, b):
"""
schduled_type: is the type of the decay. It supports constant, linear,
exponential, and inverse_sigmoid right now.
a: parameter of the decay (MUST BE DOUBLE)
b: parameter of the decay (MUST BE DOUBLE)
"""
self.schedule_type = schedule_type
self.a = a
self.b = b
self.data_processed_ = 0
self.schedule_computers = {
"constant": lambda a, b, d: a,
"linear": lambda a, b, d: max(a, 1 - d / b),
"exponential": lambda a, b, d: pow(a, d / b),
"inverse_sigmoid": lambda a, b, d: b / (b + math.exp(d * a / b)),
}
assert (self.schedule_type in self.schedule_computers)
self.schedule_computer = self.schedule_computers[self.schedule_type]
def getScheduleRate(self):
"""
Get the schedule sampling rate. Usually not needed to be called by the users
"""
return self.schedule_computer(self.a, self.b, self.data_processed_)
def processBatch(self, batch_size):
"""
Get a batch_size of sampled indexes. These indexes can be passed to a
MultiplexLayer to select from the grouth truth and generated samples
from the last time step.
"""
rate = self.getScheduleRate()
numbers = np.random.rand(batch_size)
indexes = (numbers >= rate).astype('int32').tolist()
self.data_processed_ += batch_size
return indexes
import sys
import gzip
import paddle.v2 as paddle
from random_schedule_generator import RandomScheduleGenerator
schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
def gen_schedule_data(reader):
"""
Creates a data reader for scheduled sampling.
Output from the iterator that created by original reader will be
appended with "true_token_flag" to indicate whether to use true token.
:param reader: the original reader.
:type reader: callable
:return: the new reader with the field "true_token_flag".
:rtype: callable
"""
def data_reader():
for src_ids, trg_ids, trg_ids_next in reader():
yield src_ids, trg_ids, trg_ids_next, \
[0] + schedule_generator.processBatch(len(trg_ids) - 1)
return data_reader
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
"""
The definition of the sequence to sequence model
:param source_dict_dim: the dictionary size of the source language
:type source_dict_dim: int
:param target_dict_dim: the dictionary size of the target language
:type target_dict_dim: int
:param is_generating: whether in generating mode
:type is_generating: Bool
:return: the last layer of the network
:rtype: LayerOutput
"""
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
beam_size = 3
max_length = 250
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
#### Decoder
with paddle.layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += paddle.layer.full_matrix_projection(
input=encoded_vector)
backward_first = paddle.layer.first_seq(input=src_backward)
with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first)
def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
true_token_flag):
"""
The decoder step for training.
:param enc_vec: the encoder vector for attention
:type enc_vec: LayerOutput
:param enc_proj: the encoder projection for attention
:type enc_proj: LayerOutput
:param true_word: the ground-truth target word
:type true_word: LayerOutput
:param true_token_flag: the flag of using the ground-truth target word
:type true_token_flag: LayerOutput
:return: the softmax output layer
:rtype: LayerOutput
"""
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
gru_out_memory = paddle.layer.memory(
name='gru_out', size=target_dict_dim)
generated_word = paddle.layer.max_id(input=gru_out_memory)
generated_word_emb = paddle.layer.embedding(
input=generated_word,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
current_word = paddle.layer.multiplex(
input=[true_token_flag, true_word, generated_word_emb])
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
name='gru_out',
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
def gru_decoder_with_attention_test(enc_vec, enc_proj, current_word):
"""
The decoder step for generating.
:param enc_vec: the encoder vector for attention
:type enc_vec: LayerOutput
:param enc_proj: the encoder projection for attention
:type enc_proj: LayerOutput
:param current_word: the previously generated word
:type current_word: LayerOutput
:return: the softmax output layer
:rtype: LayerOutput
"""
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
if not is_generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
true_token_flags = paddle.layer.data(
name='true_token_flag',
type=paddle.data_type.integer_value_sequence(2))
group_inputs.append(true_token_flags)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention_train,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention_test,
input=group_inputs,
bos_id=0,
eos_id=1,
beam_size=beam_size,
max_length=max_length)
return beam_gen
def main():
paddle.init(use_gpu=False, trainer_count=1)
is_generating = False
model_path_for_generating = 'params_pass_1.tar.gz'
# source and target dict dim.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# train the network
if not is_generating:
cost = seqToseq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(
learning_rate=5e-5,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
gen_schedule_data(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
batch_size=5)
feeding = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2,
'true_token_flag': 3
}
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost,
event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open('params_pass_%d.tar.gz' % event.pass_id,
'w') as f:
parameters.to_tar(f)
# start to train
trainer.train(
reader=wmt14_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=2)
    # generate English translations for the French input sentences
else:
# use the first 3 samples for generation
gen_creator = paddle.dataset.wmt14.gen(dict_size)
gen_data = []
gen_num = 3
for item in gen_creator():
gen_data.append((item[0], ))
if len(gen_data) == gen_num:
break
beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
# get the trained model
with gzip.open(model_path_for_generating, 'r') as f:
            parameters = paddle.parameters.Parameters.from_tar(f)
# prob is the prediction probabilities, and id is the prediction word.
beam_result = paddle.infer(
output_layer=beam_gen,
parameters=parameters,
input=gen_data,
field=['prob', 'id'])
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
beam_size = 3
for i in xrange(gen_num):
print "\n*******************************************************\n"
print "src:", ' '.join(
[src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
if __name__ == '__main__':
main()
TBD
# 命名实体识别
以下是本例的简要目录结构及说明:
```text
.
├── data # 存储运行本例所依赖的数据
│   ├── download.sh
├── images # README 文档中的图片
├── index.html
├── infer.py # 测试脚本
├── network_conf.py # 模型定义
├── reader.py # 数据读取接口
├── README.md # 文档
├── train.py # 训练脚本
└── utils.py # 定义通用的函数
```
## 简介
命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等,是自然语言处理研究的一个基础问题。NER任务通常包括实体边界识别、确定实体类别两部分,可以将其作为序列标注问题解决。
序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)],本例只考虑Segment Classification,即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务,由于需要标识边界,一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集,如下是一个NER的标注结果示例:
<div align="center">
<img src="images/ner_label_ins.png" width = "80%" align=center /><br>
图1. BIO标注方法示例
</div>
根据序列标注结果可以直接得到实体边界和实体类别。类似地,分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务;对于序列标注问题,通常的做法是:使用基于RNN的网络结构学习特征,将学习到的特征接入CRF完成序列标注,实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是:CRF使用句子级别的似然概率,能够更好地解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然这里以NER任务作为示例,但所给出的模型可以应用到其他各种序列标注任务中。
由于序列标注问题的广泛性,产生了[CRF](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等经典的序列模型,这些模型大多只能使用局部信息或需要人工设计特征。随着深度学习研究的发展,循环神经网络(Recurrent Neural Network,RNN)等序列模型能够处理序列元素之间的前后关联问题、能够从原始输入文本中学习特征表示,因而更加适合序列标注任务。更多相关知识可参考PaddleBook中[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一课。
## 模型详解
NER任务的输入是"一句话",目标是识别句子中的实体边界及类别,我们参照论文\[[2](#参考文献)\]仅对原始句子进行了一些简单的预处理工作:将每个词转换为小写,并将原词是否大写另作为一个特征,共同作为模型的输入。模型如图2所示,工作流程如下:
1. 构造输入
- 输入1是句子序列,采用one-hot方式表示
- 输入2是大写标记序列,标记了句子中每一个词是否是大写,采用one-hot方式表示;
2. one-hot方式的句子序列和大写标记序列通过词表,转换为实向量表示的词向量序列;
3. 将步骤2中的2个词向量序列作为双向RNN的输入,学习输入序列的特征表示,得到新的特征表示序列;
4. CRF以步骤3中模型学习到的特征为输入,以标记序列为监督信号,实现序列标注。
<div align="center">
<img src="images/ner_network.png" width = "40%" align=center /><br>
图2. NER 模型网络结构图
</div>
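下面给出一个与上述流程对应的简化网络搭建示意(仅为帮助理解的草图:词向量维度、隐层大小等均为示例取值,且用单层双向 GRU 代替了本例实际使用的多层 RNN,完整定义请参考 `network_conf.py`):
```python
import paddle.v2 as paddle
def simple_ner_net(word_dict_len, label_dict_len,
                   word_dim=50, mark_dim=5, hidden_dim=128):
    # 输入 1:句子序列;输入 2:大写标记序列
    word = paddle.layer.data(
        name="word",
        type=paddle.data_type.integer_value_sequence(word_dict_len))
    mark = paddle.layer.data(
        name="mark", type=paddle.data_type.integer_value_sequence(2))
    # 词向量与大写标记向量拼接,作为 RNN 的输入
    word_emb = paddle.layer.embedding(input=word, size=word_dim)
    mark_emb = paddle.layer.embedding(input=mark, size=mark_dim)
    feature = paddle.layer.concat(input=[word_emb, mark_emb])
    # 双向 GRU 学习输入序列的特征表示
    forward = paddle.networks.simple_gru(input=feature, size=hidden_dim)
    backward = paddle.networks.simple_gru(
        input=feature, size=hidden_dim, reverse=True)
    rnn_fea = paddle.layer.concat(input=[forward, backward])
    # 特征接入 CRF,以标记序列为监督信号完成序列标注
    emission = paddle.layer.fc(
        size=label_dict_len, bias_attr=False, input=rnn_fea)
    target = paddle.layer.data(
        name="target",
        type=paddle.data_type.integer_value_sequence(label_dict_len))
    crf = paddle.layer.crf(size=label_dict_len, input=emission, label=target)
    return crf
```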
## 数据说明
在本例中,我们以 [CoNLL 2003 NER任务](http://www.clips.uantwerpen.be/conll2003/ner/)为例,原始Reuters数据由于版权原因需另外申请免费下载,请大家按照原网站说明获取。
+ 我们仅在`data`目录下的`train`和`test`文件中放置少数样本用以示例输入数据格式。
+ 本例依赖数据还包括
1. 输入文本的词典
2. 为词典中的词语提供预训练好的词向量
    3. 标记标签的词典
标记标签词典已附在`data`目录中,对应于`data/target.txt`文件。输入文本的词典以及词典中词语的预训练的词向量来自:[Stanford CS224d](http://cs224d.stanford.edu/)课程作业。**为运行本例,请首先在`data`目录下运行`download.sh`脚本下载输入文本的词典和预训练的词向量。** 完成后会将这两个文件一并放入`data`目录下,输入文本的词典和预训练的词向量分别对应:`data/vocab.txt`和`data/wordVectors.txt`这两个文件。
CoNLL 2003原始数据格式如下:
```
U.N. NNP I-NP I-ORG
official NN I-NP O
Ekeus NNP I-NP I-PER
heads VBZ I-VP O
for IN I-PP O
Baghdad NNP I-NP I-LOC
. . O O
```
- 第一列为原始句子序列
- 第二、三列分别为词性标签和句法分析中的语块标签,本例不使用
- 第四列为采用了 I-TYPE 方式表示的NER标签
- I-TYPE 和 BIO 方式的主要区别在于语块开始标记的使用上:I-TYPE 只有在出现相邻的同类别实体时对后者使用 B 标记,其他均使用 I 标记。句子之间以空行分隔。
我们在`reader.py`脚本中完成对原始数据的处理以及读取,主要包括下面几个步骤:
1. 从原始数据文件中抽取出句子和标签,构造句子序列和标签序列;
2. 将 I-TYPE 表示的标签转换为 BIO 方式表示的标签;
3. 将句子序列中的单词转换为小写,并构造大写标记序列;
4. 依据词典获取词对应的整数索引。
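其中第 2 步的转换逻辑与 `reader.py` 中的处理一致,可以抽取为如下示意函数:
```python
def itype_to_bio(labels):
    # 将 I-TYPE 标注序列转换为 BIO 标注序列(与 reader.py 中的逻辑一致)
    bio = []
    for label in labels:
        if label != "O" and (len(bio) == 0 or bio[-1][1:] != label[1:]):
            bio.append("B" + label[1:])
        else:
            bio.append(label)
    return bio
print(itype_to_bio(["I-ORG", "I-ORG", "O", "I-PER", "I-PER"]))
# 输出:['B-ORG', 'I-ORG', 'O', 'B-PER', 'I-PER']
```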
预处理完成后,一条训练样本包含3个部分作为神经网络的输入信息用于训练:(1)句子序列;(2)首字母大写标记序列;(3)标注序列,下表是一条训练样本的示例:
| 句子序列 | 大写标记序列 | 标注序列 |
|---|---|---|
| u.n. | 1 | B-ORG |
| official | 0 | O |
| ekeus | 1 | B-PER |
| heads | 0 | O |
| for | 0 | O |
| baghdad | 1 | B-LOC |
| . | 0 | O |
## 运行
### 编写数据读取接口
自定义数据读取接口只需编写一个 Python 生成器,实现从原始输入文本中解析一条训练样本的逻辑。[reader.py](./reader.py) 中的`data_reader`函数实现了读取原始数据,并返回 3 个类型为 `paddle.data_type.integer_value_sequence`的输入(分别对应:词语在字典中的序号、是否为大写、标注结果在字典中的序号),供`network_conf.ner_net`中定义的 3 个 `data_layer` 使用。
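下面是一个展示 `data_reader` 返回数据形式的示意片段(其中词典与数据文件路径均为本例 `data` 目录下的默认文件,仅作演示):
```python
from utils import load_dict
from reader import data_reader
word_dict = load_dict("data/vocab.txt")
label_dict = load_dict("data/target.txt")
# 每条样本依次为:词语 id 序列、大写标记序列、标签 id 序列
for word_ids, marks, label_ids in data_reader("data/test", word_dict, label_dict)():
    print(word_ids, marks, label_ids)
    break
```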
### 训练
1. 运行 `sh data/download.sh`
2. 修改 `train.py` 的 `main` 函数,指定数据路径
```python
main(
train_data_file='data/train',
test_data_file='data/test',
vocab_file='data/vocab.txt',
target_file='data/target.txt',
emb_file='data/wordVectors.txt')
```
3. 运行命令 `python train.py`。**需要注意:直接运行使用的是示例数据,请替换为真实的标记数据。**
```text
commandline: --use_gpu=False --trainer_count=1
Initing parameters..
Init parameters done.
Pass 0, Batch 0, Cost 41.430110, {'ner_chunk.precision': 0.01587301678955555, 'ner_chunk.F1-score': 0.028368793427944183, 'ner_chunk.recall': 0.13333334028720856, 'error': 0.939393937587738}
Test with Pass 0, Batch 0, {'ner_chunk.precision': 0.0, 'ner_chunk.F1-score': 0.0, 'ner_chunk.recall': 0.0, 'error': 0.16260161995887756}
```
### 预测
1. 修改 [infer.py](./infer.py) 的 `main` 函数,指定需要测试的模型路径、测试数据、字典文件和预测标记文件的路径,默认参数如下:
```python
infer(
model_path="models/params_pass_0.tar.gz",
batch_size=2,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练500个pass所得模型的部分预测结果):
```text
cricket O
- O
leicestershire B-ORG
take O
over O
at O
top O
after O
innings O
victory O
. O
london B-LOC
1996-08-30 O
west B-MISC
indian I-MISC
all-rounder O
phil B-PER
simmons I-PER
took O
four O
```
输出分为两列,以“\t” 分隔,第一列是输入的词语,第二列是标记结果。多条输入序列之间以空行分隔。
## 参考文献
1. Graves A. [Supervised Sequence Labelling with Recurrent Neural Networks](http://www.cs.toronto.edu/~graves/preprint.pdf)[J]. Studies in Computational Intelligence, 2013, 385.
2. Collobert R, Weston J, Bottou L, et al. [Natural Language Processing (Almost) from Scratch](http://www.jmlr.org/papers/volume12/collobert11a/collobert11a.pdf)[J]. Journal of Machine Learning Research, 2011, 12(1):2493-2537.
wget http://cs224d.stanford.edu/assignment2/assignment2.zip
if [ $? -eq 0 ];then
unzip assignment2.zip
cp assignment2_release/data/ner/wordVectors.txt ./data
cp assignment2_release/data/ner/vocab.txt ./data
rm -rf assignment2.zip assignment2_release
else
echo "download data error!" >> /dev/stderr
exit 1
fi
B-LOC
I-LOC
B-MISC
I-MISC
B-ORG
I-ORG
B-PER
I-PER
O
CRICKET NNP I-NP O
- : O O
LEICESTERSHIRE NNP I-NP I-ORG
TAKE NNP I-NP O
OVER IN I-PP O
AT NNP I-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O
LONDON NNP I-NP I-LOC
1996-08-30 CD I-NP O
West NNP I-NP I-MISC
Indian NNP I-NP I-MISC
all-rounder NN I-NP O
Phil NNP I-NP I-PER
Simmons NNP I-NP I-PER
took VBD I-VP O
four CD I-NP O
for IN I-PP O
38 CD I-NP O
on IN I-PP O
Friday NNP I-NP O
as IN I-PP O
Leicestershire NNP I-NP I-ORG
beat VBD I-VP O
Somerset NNP I-NP I-ORG
by IN I-PP O
an DT I-NP O
innings NN I-NP O
and CC O O
39 CD I-NP O
runs NNS I-NP O
in IN I-PP O
two CD I-NP O
days NNS I-NP O
to TO I-VP O
take VB I-VP O
over IN I-PP O
at IN B-PP O
the DT I-NP O
head NN I-NP O
of IN I-PP O
the DT I-NP O
county NN I-NP O
championship NN I-NP O
. . O O
Their PRP$ I-NP O
stay NN I-NP O
on IN I-PP O
top NN I-NP O
, , O O
though RB I-ADVP O
, , O O
may MD I-VP O
be VB I-VP O
short-lived JJ I-ADJP O
as IN I-PP O
title NN I-NP O
rivals NNS I-NP O
Essex NNP I-NP I-ORG
, , O O
Derbyshire NNP I-NP I-ORG
and CC I-NP O
Surrey NNP I-NP I-ORG
all DT O O
closed VBD I-VP O
in RP I-PRT O
on IN I-PP O
victory NN I-NP O
while IN I-SBAR O
Kent NNP I-NP I-ORG
made VBD I-VP O
up RP I-PRT O
for IN I-PP O
lost VBN I-NP O
time NN I-NP O
in IN I-PP O
their PRP$ I-NP O
rain-affected JJ I-NP O
match NN I-NP O
against IN I-PP O
Nottinghamshire NNP I-NP I-ORG
. . O O
After IN I-PP O
bowling VBG I-NP O
Somerset NNP I-NP I-ORG
out RP I-PRT O
for IN I-PP O
83 CD I-NP O
on IN I-PP O
the DT I-NP O
opening NN I-NP O
morning NN I-NP O
at IN I-PP O
Grace NNP I-NP I-LOC
Road NNP I-NP I-LOC
, , O O
Leicestershire NNP I-NP I-ORG
extended VBD I-VP O
their PRP$ I-NP O
first JJ I-NP O
innings NN I-NP O
by IN I-PP O
94 CD I-NP O
runs VBZ I-VP O
before IN I-PP O
being VBG I-VP O
bowled VBD I-VP O
out RP I-PRT O
for IN I-PP O
296 CD I-NP O
with IN I-PP O
England NNP I-NP I-LOC
discard VBP I-VP O
Andy NNP I-NP I-PER
Caddick NNP I-NP I-PER
taking VBG I-VP O
three CD I-NP O
for IN I-PP O
83 CD I-NP O
. . O O
EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O
Peter NNP I-NP I-PER
Blackburn NNP I-NP I-PER
BRUSSELS NNP I-NP I-LOC
1996-08-22 CD I-NP O
The DT I-NP O
European NNP I-NP I-ORG
Commission NNP I-NP I-ORG
said VBD I-VP O
on IN I-PP O
Thursday NNP I-NP O
it PRP B-NP O
disagreed VBD I-VP O
with IN I-PP O
German JJ I-NP I-MISC
advice NN I-NP O
to TO I-PP O
consumers NNS I-NP O
to TO I-VP O
shun VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
until IN I-SBAR O
scientists NNS I-NP O
determine VBP I-VP O
whether IN I-SBAR O
mad JJ I-NP O
cow NN I-NP O
disease NN I-NP O
can MD I-VP O
be VB I-VP O
transmitted VBN I-VP O
to TO I-PP O
sheep NN I-NP O
. . O O
Germany NNP I-NP I-LOC
's POS B-NP O
representative NN I-NP O
to TO I-PP O
the DT I-NP O
European NNP I-NP I-ORG
Union NNP I-NP I-ORG
's POS B-NP O
veterinary JJ I-NP O
committee NN I-NP O
Werner NNP I-NP I-PER
Zwingmann NNP I-NP I-PER
said VBD I-VP O
on IN I-PP O
Wednesday NNP I-NP O
consumers NNS I-NP O
should MD I-VP O
buy VB I-VP O
sheepmeat NN I-NP O
from IN I-PP O
countries NNS I-NP O
other JJ I-ADJP O
than IN I-PP O
Britain NNP I-NP I-LOC
until IN I-SBAR O
the DT I-NP O
scientific JJ I-NP O
advice NN I-NP O
was VBD I-VP O
clearer JJR I-ADJP O
. . O O
" " O O
We PRP I-NP O
do VBP I-VP O
n't RB I-VP O
support VB I-VP O
any DT I-NP O
such JJ I-NP O
recommendation NN I-NP O
because IN I-SBAR O
we PRP I-NP O
do VBP I-VP O
n't RB I-VP O
see VB I-VP O
any DT I-NP O
grounds NNS I-NP O
for IN I-PP O
it PRP I-NP O
, , O O
" " O O
the DT I-NP O
Commission NNP I-NP I-ORG
's POS B-NP O
chief JJ I-NP O
spokesman NN I-NP O
Nikolaus NNP I-NP I-PER
van NNP I-NP I-PER
der FW I-NP I-PER
Pas NNP I-NP I-PER
told VBD I-VP O
a DT I-NP O
news NN I-NP O
briefing NN I-NP O
. . O O
He PRP I-NP O
said VBD I-VP O
further JJ I-NP O
scientific JJ I-NP O
study NN I-NP O
was VBD I-VP O
required VBN I-VP O
and CC O O
if IN I-SBAR O
it PRP I-NP O
was VBD I-VP O
found VBN I-VP O
that IN I-SBAR O
action NN I-NP O
was VBD I-VP O
needed VBN I-VP O
it PRP I-NP O
should MD I-VP O
be VB I-VP O
taken VBN I-VP O
by IN I-PP O
the DT I-NP O
European NNP I-NP I-ORG
Union NNP I-NP I-ORG
. . O O
import gzip
import reader
import paddle.v2 as paddle
from network_conf import ner_net
from utils import load_dict, load_reverse_dict
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
probs = inferer.infer(input=test_data, field=["id"])
assert len(probs) == sum(len(x[0]) for x in test_data)
        start_id = 0
        for idx, test_sample in enumerate(test_data):
for w, tag in zip(test_sample[0],
probs[start_id:start_id + len(test_sample[0])]):
print("%s\t%s" % (id_2_word[w], id_2_label[tag]))
print("\n")
start_id += len(test_sample[0])
word_dict = load_dict(vocab_file)
word_dict_len = len(word_dict)
word_reverse_dict = load_reverse_dict(vocab_file)
label_dict = load_dict(target_file)
label_reverse_dict = load_reverse_dict(target_file)
label_dict_len = len(label_dict)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
predict = ner_net(
word_dict_len=word_dict_len,
label_dict_len=label_dict_len,
is_train=False)
inferer = paddle.inference.Inference(
output_layer=predict, parameters=parameters)
test_data = []
for i, item in enumerate(
reader.data_reader(test_data_file, word_dict, label_dict)()):
test_data.append([item[0], item[1]])
if len(test_data) == batch_size:
_infer_a_batch(inferer, test_data, word_reverse_dict,
label_reverse_dict)
test_data = []
    if len(test_data) > 0:
        _infer_a_batch(inferer, test_data, word_reverse_dict,
                       label_reverse_dict)
        test_data = []
if __name__ == "__main__":
infer(
model_path="models/params_pass_0.tar.gz",
batch_size=2,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt")
import math
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
def ner_net(word_dict_len, label_dict_len, stack_num=2, is_train=True):
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 128
word = paddle.layer.data(
name='word',
type=paddle.data_type.integer_value_sequence(word_dict_len))
word_embedding = paddle.layer.embedding(
input=word,
size=word_dim,
param_attr=paddle.attr.Param(
name='emb', initial_std=math.sqrt(1. / word_dim), is_static=True))
mark = paddle.layer.data(
name='mark',
type=paddle.data_type.integer_value_sequence(mark_dict_len))
mark_embedding = paddle.layer.embedding(
input=mark,
size=mark_dim,
param_attr=paddle.attr.Param(initial_std=math.sqrt(1. / word_dim)))
word_caps_vector = paddle.layer.concat(
input=[word_embedding, mark_embedding])
mix_hidden_lr = 1e-3
rnn_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=0.1)
hidden_para_attr = paddle.attr.Param(
initial_std=1 / math.sqrt(hidden_dim), learning_rate=mix_hidden_lr)
# the first rnn layer shares the input-to-hidden mappings.
hidden = paddle.layer.fc(
name="__hidden00__",
size=hidden_dim,
act=paddle.activation.Tanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=word_caps_vector,
param_attr=hidden_para_attr)
fea = []
for direction in ["fwd", "bwd"]:
for i in range(stack_num):
if i:
hidden = paddle.layer.fc(
name="__hidden%02d_%s__" % (i, direction),
size=hidden_dim,
act=paddle.activation.STanh(),
bias_attr=paddle.attr.Param(initial_std=1.),
input=[hidden, rnn],
param_attr=[hidden_para_attr, rnn_para_attr])
rnn = paddle.layer.recurrent(
name="__rnn%02d_%s__" % (i, direction),
input=hidden,
act=paddle.activation.Relu(),
bias_attr=paddle.attr.Param(initial_std=1.),
reverse=i % 2 if direction == "fwd" else not i % 2,
param_attr=rnn_para_attr)
fea += [hidden, rnn]
rnn_fea = paddle.layer.fc(
size=hidden_dim,
bias_attr=paddle.attr.Param(initial_std=1.),
act=paddle.activation.STanh(),
input=fea,
param_attr=[hidden_para_attr, rnn_para_attr] * 2)
emission = paddle.layer.fc(
size=label_dict_len,
bias_attr=False,
input=rnn_fea,
param_attr=rnn_para_attr)
if is_train:
target = paddle.layer.data(
name='target',
type=paddle.data_type.integer_value_sequence(label_dict_len))
crf = paddle.layer.crf(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw', initial_std=1e-3))
crf_dec = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf, crf_dec, target
else:
predict = paddle.layer.crf_decoding(
size=label_dict_len,
input=emission,
param_attr=paddle.attr.Param(name='crfw'))
return predict
"""
CoNLL-2003 dataset reader.
"""
from utils import *
__all__ = ["data_reader"]
def canonicalize_digits(word):
if any([c.isalpha() for c in word]): return word
word = re.sub("\d", "DG", word)
if word.startswith("DG"):
word = word.replace(",", "") # remove thousands separator
return word
def canonicalize_word(word, wordset=None, digits=True):
word = word.lower()
if digits:
if (wordset != None) and (word in wordset): return word
word = canonicalize_digits(word) # try to canonicalize numbers
if (wordset == None) or (word in wordset): return word
else: return "<UNK>" # unknown token
def data_reader(data_file, word_dict, label_dict):
"""
    The dataset can be obtained from http://www.clips.uantwerpen.be/conll2003/ner/.
    This function returns a reader creator; each sample produced by the reader
    contains a word id sequence, a capitalization mark sequence and a label id sequence.
:return: reader creator
:rtype: callable
"""
def reader():
UNK_IDX = word_dict["<UNK>"]
sentence = []
labels = []
with open(data_file, "r") as f:
for line in f:
if len(line.strip()) == 0:
if len(sentence) > 0:
word_idx = [
word_dict.get(
canonicalize_word(w, word_dict), UNK_IDX)
for w in sentence
]
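                    # capitalization feature: 1 if the original word starts
                    # with an uppercase letter, 0 otherwise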
mark = [1 if w[0].isupper() else 0 for w in sentence]
label_idx = [label_dict[l] for l in labels]
yield word_idx, mark, label_idx
sentence = []
labels = []
else:
segs = line.strip().split()
sentence.append(segs[0])
# transform I-TYPE to BIO schema
if segs[-1] != "O" and (len(labels) == 0 or
labels[-1][1:] != segs[-1][1:]):
labels.append("B" + segs[-1][1:])
else:
labels.append(segs[-1])
return reader
import os
import gzip
import numpy as np
import reader
from utils import logger, load_dict, get_embedding
from network_conf import ner_net
import paddle.v2 as paddle
import paddle.v2.evaluator as evaluator
def main(train_data_file,
test_data_file,
vocab_file,
target_file,
emb_file,
model_save_dir,
num_passes=10,
batch_size=32):
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
word_vector_values = get_embedding(emb_file)
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
evaluator.sum(name="error", input=crf_dec)
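    # chunk evaluator: reports precision / recall / F1 over the decoded
    # entities; with the IOB scheme every entity type has a B- and an I- tag
    # plus the single O tag, hence (label_dict_len - 1) / 2 chunk types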
evaluator.chunk(
name="ner_chunk",
input=crf_dec,
label=target,
chunk_scheme="IOB",
num_chunk_types=(label_dict_len - 1) / 2)
# create parameters
parameters = paddle.parameters.create(crf_cost)
parameters.set("emb", word_vector_values)
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-4,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
gradient_clipping_threshold=25,
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
trainer = paddle.trainer.SGD(
cost=crf_cost,
parameters=parameters,
update_equation=optimizer,
extra_layers=crf_dec)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=1000),
batch_size=batch_size)
feeding = {"word": 0, "mark": 1, "target": 2}
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if event.batch_id % 1 == 0:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, Batch %d, %s" %
(event.pass_id, event.batch_id, result.metrics))
if isinstance(event, paddle.event.EndPass):
# save parameters
with gzip.open(
os.path.join(model_save_dir, "params_pass_%d.tar.gz" %
event.pass_id), "w") as f:
parameters.to_tar(f)
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("\nTest with Pass %d, %s" % (event.pass_id,
result.metrics))
trainer.train(
reader=train_reader,
event_handler=event_handler,
num_passes=num_passes,
feeding=feeding)
if __name__ == "__main__":
main(
train_data_file="data/train",
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
        emb_file="data/wordVectors.txt",
        model_save_dir="models")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import re
import argparse
import numpy as np
from collections import defaultdict
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype=float)
def load_dict(dict_path):
"""
    Load the word dictionary from the given file.
    Each line of the given file is a word, which can include multiple columns
    separated by tabs.
    This function takes the first column (columns in a line are separated by
    tabs) as the key and the line number (the index of the word in the
    dictionary) as the value.
"""
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
"""
    Load the word dictionary from the given file.
    Each line of the given file is a word, which can include multiple columns
    separated by tabs.
    This function takes the line number (the index of the word in the
    dictionary) as the key and the first column (columns in a line are
    separated by tabs) as the value.
"""
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
data
*.tar.gz
*.log
*.pyc
TBD
# Text Classification
The files in this example's directory and their descriptions are as follows:
```text
.
├── images                # figures used in the documentation
│   ├── cnn_net.png
│   └── dnn_net.png
├── index.html            # the documentation (HTML version)
├── infer.py              # inference script
├── network_conf.py       # all network structures used in this example are defined here; modify this file to change the model structure
├── reader.py             # data reading interface; modify this file to use data in a custom format
├── README.md             # the documentation
├── run.sh                # script that launches the training job; running it directly starts training with the default arguments
├── train.py              # training script
└── utils.py              # common helper functions, e.g. logging, command-line argument parsing, building and loading dictionaries
```
## Introduction
Text classification determines which category a given piece of text belongs to, and is an important, fundamental task in natural language processing. The [sentiment classification](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md) chapter of [PaddleBook](https://github.com/PaddlePaddle/book) is a typical text classification task; its workflow is:
1. Collect user reviews from a movie review website.
2. Clean and label the data.
3. Design the model.
4. Evaluate how well the model learns.
A well-trained classifier can **automatically** decide whether a new user review expresses positive or negative sentiment, which is valuable in tasks such as public opinion monitoring, marketing planning, and brand value assessment. The steps above are also the usual workflow for any new text classification task. As they show, the great advantage of deep learning methods is that **complex feature engineering is no longer needed; basic cleaning and labeling of the raw text is enough**.
The [sentiment classification](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md) chapter of [PaddleBook](https://github.com/PaddlePaddle/book) introduces a relatively complex stacked bidirectional LSTM model. Recurrent neural networks have a clear advantage on complex tasks that require understanding language semantics, but they are computationally expensive and usually demand more careful hyper-parameter tuning. When computation time is limited, other models are worth considering. Beyond computation time, there is a more important point: **model selection is often the foundation of a successful machine learning task**. The goal of a machine learning task is always to improve generalization, i.e. the ability to predict new, unseen samples:
1. A model that is too simple cannot fit the training samples accurately, let alone be expected to predict unknown samples that never appeared in the training set; this is the **underfitting** problem.
2. Conversely, an overly complex model easily "memorizes" every sample in the training set but cannot recognize unknown samples outside it at all; this is the **overfitting** problem.
"No Free Lunch (NFL)" is one of the basic principles of machine learning: no single model is inherently better than all others. Model design and selection are built on understanding the characteristics of different models, but they are also a process of repeated experimentation and evaluation. In this example we introduce several of the most commonly used text classification models, which differ in capacity and complexity, so that you can compare how well they learn and choose the right one for your scenario.
## Models in Detail
`network_conf.py` defines the following models:
1. `fc_net`: a DNN model. It is a non-sequence model that uses a basic fully connected structure.
2. `convolution_net`: a shallow CNN model. It is a basic sequence model that handles variable-length sequence input and extracts features within a local window.
We use sentiment classification as an example to briefly explain the difference between sequence and non-sequence models. Sentiment classification is a common text classification task in which the model automatically decides whether the sentiment expressed in a text is positive or negative. Take the sentence "The apple is not bad" as an example; "not bad" is the key to the sentiment of this sentence:
- The DNN model only knows that the sentence contains a "not" and a "bad"; the order between them is lost when the input is fed into the network, so the network has no chance to learn the ordering information of the sequence.
- The CNN model takes the text sequence as input and preserves the order information between "not" and "bad".
Some characteristics of the two models are summarized below:
1. The computational cost of a DNN can be far lower than that of CNN / RNN models, which is an advantage in tasks with response-time requirements.
2. A DNN mostly captures frequent-word features and can be affected by word segmentation errors, but for tasks that can rely on keyword features, such as spam SMS detection, it remains an effective model.
3. On most text classification tasks that require some degree of semantic understanding (for example, using context to resolve ambiguity), sequence models such as CNNs / RNNs usually outperform DNN models.
### 1. DNN Model
**The structure of the DNN model is shown in the figure below:**
<p align="center">
<img src="images/dnn_net.png" width = "90%" align="center"/><br/>
Figure 1. The DNN text classification model in this example
</p>
The PaddlePaddle implementation of this DNN structure is the `fc_net` function in `network_conf.py`. The model consists of the following parts:
- **Word embedding layer**: to better represent the semantic relationship between words, each word is first mapped to a fixed-dimension vector. After training, the semantic similarity of two words can be measured by the distance between their word vectors: the more similar their meanings, the closer the vectors. For more on word embeddings, see the [word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec) chapter of PaddleBook.
- **Max pooling layer**: max pooling is performed over the time dimension. Pooling removes the difference in the number of words between samples and extracts the maximum value at each position of the word vectors. After pooling, the sequence of vectors output by the embedding layer is turned into a single fixed-dimension vector. For example, if the sequence of vectors before pooling is `[[2,3,5],[7,3,6],[1,4,0]]`, the pooled result is `[7,4,6]` (see the short NumPy sketch at the end of this section).
- **Fully connected hidden layers**: the pooled vector is fed into two consecutive hidden layers connected by a fully connected structure.
- **Output layer**: the number of neurons in the output layer equals the number of classes; for a binary classification problem the output layer has 2 neurons. With the Softmax activation, the output is a normalized probability distribution that sums to 1, so the output of the $i$-th neuron can be interpreted as the predicted probability that the sample belongs to class $i$.
By default this DNN model performs binary classification on the input corpus (`class_num=2`), the embedding (word vector) dimension is 28 (`emb_dim=28`), and both hidden layers use the Tanh activation (`act=paddle.activation.Tanh()`). Note that the model's input is a sequence of integers rather than the original word sequence; in practice, words are usually mapped to ids in advance by frequency order, i.e. each word is replaced by its index in the dictionary.
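The max pooling over time described above can be illustrated with a minimal NumPy sketch (NumPy is used here only for illustration and is not part of this example; the array below is a made-up embedding sequence): each row is the embedding of one word, and pooling takes the column-wise maximum to collapse the variable-length sequence into one fixed-size vector.
```python
import numpy as np

# a made-up sequence of 3 word embeddings, each of dimension 3
emb_seq = np.array([[2, 3, 5],
                    [7, 3, 6],
                    [1, 4, 0]])

# max pooling over time: take the maximum of every embedding dimension
# across all time steps, turning the sequence into a single vector
pooled = emb_seq.max(axis=0)
print(pooled)  # [7 4 6]
```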
### 2. CNN Model
**The structure of the CNN model is shown in the figure below:**
<p align="center">
<img src="images/cnn_net.png" width = "90%" align="center"/><br/>
Figure 2. The CNN text classification model in this example
</p>
The PaddlePaddle implementation of this CNN structure is the `convolution_net` function in `network_conf.py`. The model consists of the following parts:
- **Word embedding layer**: as in the DNN, this layer maps each word to a fixed-dimension vector, so that the distance between vectors reflects the semantic relatedness of words. As shown in Figure 2, each word vector is treated as a row vector, and the row vectors of all words in the text are stacked into a matrix. If the embedding dimension is 5 and the sentence "The cat sat on the red mat" contains 7 words, the resulting matrix has shape 7*5. For more on word embeddings, see the [word2vec](https://github.com/PaddlePaddle/book/tree/develop/04.word2vec) chapter of PaddleBook.
- **Convolution layer**: in text classification, convolution is performed over the time dimension, i.e. the width of the kernel equals the width of the matrix produced by the embedding layer, and the convolution slides along the height of the matrix. The result of the convolution is called a "feature map". If the kernel height is $h$, the matrix height is $N$, and the stride is 1, the feature map is a vector of height $N+1-h$. Several kernels of different heights can be used at the same time to obtain multiple feature maps.
- **Max pooling layer**: max pooling is applied to each feature map separately. Since a feature map is already a vector, max pooling here simply selects the maximum element of each vector. The maxima are then concatenated into a new vector whose dimension obviously equals the number of feature maps, i.e. the number of kernels. For example, suppose four different kernels produce the feature maps `[2,3,5]`, `[8,2,1]`, `[5,7,7,6]` and `[4,5,1,8]`; because the kernel heights differ, the feature maps have different sizes. Max pooling over these four feature maps gives `[5]`, `[8]`, `[7]` and `[8]`, and concatenating the pooled results yields `[5,8,7,8]`.
- **Fully connected and output layer**: the max pooling result is passed through a fully connected layer. As in the DNN model, the number of neurons in the output layer equals the number of classes and the outputs sum to 1.
The input data type of the CNN network is the same as that of the DNN. PaddlePaddle already provides a text sequence convolution module with pooling, `paddle.networks.sequence_conv_pool`, which can be called directly. Its `context_len` parameter specifies the length of text covered by the kernel at one time, i.e. the kernel height in Figure 2; `hidden_size` specifies the number of kernels of that size. By default this example uses 128 kernels of size 3 and 128 kernels of size 4; the convolution results are max pooled and concatenated into a 256-dimensional vector, which is passed through a fully connected layer to produce the final prediction (see the sketch below).
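As a quick reference, the core of such a CNN can be assembled in a few lines. The sketch below is a condensed version of the `convolution_net` function in `network_conf.py`; the layer sizes are only the illustrative defaults used in this example.
```python
import paddle.v2 as paddle

def simple_cnn(dict_dim, class_dim=2, emb_dim=28, hid_dim=128):
    # integer word ids, fed in as a variable-length sequence
    word = paddle.layer.data(
        "word", paddle.data_type.integer_value_sequence(dict_dim))
    emb = paddle.layer.embedding(input=word, size=emb_dim)
    # convolution + max pooling with kernel heights 3 and 4
    conv_3 = paddle.networks.sequence_conv_pool(
        input=emb, context_len=3, hidden_size=hid_dim)
    conv_4 = paddle.networks.sequence_conv_pool(
        input=emb, context_len=4, hidden_size=hid_dim)
    # concatenate the pooled features and map them to class probabilities
    return paddle.layer.fc(
        input=[conv_3, conv_4],
        size=class_dim,
        act=paddle.activation.Softmax())
```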
## Running with the PaddlePaddle Built-in Dataset
### How to Train
Run `sh run.sh` in the terminal to run this example directly on the built-in PaddlePaddle sentiment classification dataset `paddle.dataset.imdb`. You will see output like the following:
```text
Pass 0, Batch 0, Cost 0.696031, {'__auc_evaluator_0__': 0.47360000014305115, 'classification_error_evaluator': 0.5}
Pass 0, Batch 100, Cost 0.544438, {'__auc_evaluator_0__': 0.839249312877655, 'classification_error_evaluator': 0.30000001192092896}
Pass 0, Batch 200, Cost 0.406581, {'__auc_evaluator_0__': 0.9030032753944397, 'classification_error_evaluator': 0.2199999988079071}
Test at Pass 0, {'__auc_evaluator_0__': 0.9289745092391968, 'classification_error_evaluator': 0.14927999675273895}
```
A log line is printed every 100 batches, containing: (1) the pass id; (2) the batch id; (3) the values of the evaluation metrics on the current batch. The evaluation metrics are specified when the network topology is configured; in the output above they are the AUC and the classification error rate on the training set.
### How to Predict
After training, the models are by default stored under the `models` directory of the current working directory. Run `python infer.py` in the terminal and the inference script will load a trained model to make predictions.
- By default it loads the DNN model produced by training on `paddle.dataset.imdb.train` for one pass and tests it on `paddle.dataset.imdb.test`.
You will see output like the following:
```text
positive 0.9275 0.0725 previous reviewer <unk> <unk> gave a much better <unk> of the films plot details than i could what i recall mostly is that it was just so beautiful in every sense emotionally visually <unk> just <unk> br if you like movies that are wonderful to look at and also have emotional content to which that beauty is relevant i think you will be glad to have seen this extraordinary and unusual work of <unk> br on a scale of 1 to 10 id give it about an <unk> the only reason i shy away from 9 is that it is a mood piece if you are in the mood for a really artistic very romantic film then its a 10 i definitely think its a mustsee but none of us can be in that mood all the time so overall <unk>
negative 0.0300 0.9700 i love scifi and am willing to put up with a lot scifi <unk> are usually <unk> <unk> and <unk> i tried to like this i really did but it is to good tv scifi as <unk> 5 is to star trek the original silly <unk> cheap cardboard sets stilted dialogues cg that doesnt match the background and painfully onedimensional characters cannot be overcome with a scifi setting im sure there are those of you out there who think <unk> 5 is good scifi tv its not its clichéd and <unk> while us viewers might like emotion and character development scifi is a genre that does not take itself seriously <unk> star trek it may treat important issues yet not as a serious philosophy its really difficult to care about the characters here as they are not simply <unk> just missing a <unk> of life their actions and reactions are wooden and predictable often painful to watch the makers of earth know its rubbish as they have to always say gene <unk> earth otherwise people would not continue watching <unk> <unk> must be turning in their <unk> as this dull cheap poorly edited watching it without <unk> breaks really brings this home <unk> <unk> of a show <unk> into space spoiler so kill off a main character and then bring him back as another actor <unk> <unk> all over again
```
Each line of the output is the prediction for one sample, with 3 columns separated by `\t`: (1) the predicted class label; (2) the probabilities of the sample belonging to each class, separated by spaces; (3) the input text.
## Training and Predicting with Custom Data
### How to Train
1. Data organization
Suppose the training data has the following format: each line is one sample, separated by `\t`, where the first column is the class label and the second column is the input text, with words separated by spaces. Two example lines:
```
positive PaddlePaddle is good
negative What a terrible weather
```
2. Writing the data reading interface
To customize the data reading interface, you only need to write a Python generator that implements the logic of **parsing one training sample from the raw input text**. The following code snippet reads the raw data and returns two inputs, of type `paddle.data_type.integer_value_sequence` (the word indices in the dictionary) and `paddle.data_type.integer_value` (the class label), which feed the two `data_layer`s defined in the network.
```python
def train_reader(data_dir, word_dict, label_dict):
def reader():
UNK_ID = word_dict["<UNK>"]
word_col = 0
lbl_col = 1
for file_name in os.listdir(data_dir):
with open(os.path.join(data_dir, file_name), "r") as f:
for line in f:
line_split = line.strip().split("\t")
word_ids = [
word_dict.get(w, UNK_ID)
for w in line_split[word_col].split()
]
yield word_ids, label_dict[line_split[lbl_col]]
return reader
```
- For the input data types accepted by `data_layer` in PaddlePaddle and the data format the reading interface should return accordingly, please refer to the [input-types](http://www.paddlepaddle.org/release_doc/0.9.0/doc_cn/ui/data_provider/pydataprovider2.html#input-types) section.
- The full version of the snippet above is in the `reader.py` script in this example's directory; `reader.py` also provides the complete code for reading the test data.
Next, simply pass the data reading function `train_reader` as an argument to the `paddle.batch` interface in the `train.py` script to read data through the custom interface, as follows:
```python
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(train_data_dir, word_dict, lbl_dict),
buf_size=1000),
batch_size=batch_size)
```
3. Modifying the command-line arguments
- If your data are organized in the same format as the example data, you only need to modify the launch arguments of `train.py` in the `run.sh` script and specify the `train_data_dir` parameter; the example can then be run directly without modifying the data reading interface `reader.py`.
- Run `python train.py --help` for a detailed description of all launch arguments of the `train.py` script. The main parameters are:
- `nn_type`: which model to use; currently "dnn" and "cnn" are supported.
- `train_data_dir`: directory of the training data. It must be specified to train on custom data; otherwise `paddle.dataset.imdb` is used for training and the `test_data_dir`, `word_dict`, and `label_dict` parameters are ignored.
- `test_data_dir`: directory of the test data; if not specified, no testing is performed.
- `word_dict`: path of the word dictionary file; if not specified, a dictionary is built automatically from the training data based on word frequency.
- `label_dict`: the class label dictionary, used to map string class labels to integer indices.
- `batch_size`: the number of samples used for one forward pass and backward update of the network.
- `num_passes`: the number of passes (epochs) to train.
### How to Predict
1. Modify the following variables in `infer.py` to specify the model to use and the test data.
```python
model_path = "dnn_params_pass_00000.tar.gz" # 指定模型所在的路径
nn_type = "dnn" # 指定测试使用的模型
test_dir = "./data/test" # 指定测试文件所在的目录
word_dict = "./data/dict/word_dict.txt" # 指定字典所在的路径
label_dict = "./data/dict/label_dict.txt" # 指定类别标签字典的路径
```
2. Run `python infer.py` in the terminal.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import gzip
import paddle.v2 as paddle
import reader
from network_conf import fc_net, convolution_net
from utils import logger, load_dict, load_reverse_dict
def infer(topology, data_dir, model_path, word_dict_path, label_dict_path,
batch_size):
def _infer_a_batch(inferer, test_batch, ids_2_word, ids_2_label):
probs = inferer.infer(input=test_batch, field=["value"])
assert len(probs) == len(test_batch)
for word_ids, prob in zip(test_batch, probs):
word_text = " ".join([ids_2_word[id] for id in word_ids[0]])
print("%s\t%s\t%s" % (ids_2_label[prob.argmax()],
" ".join(["{:0.4f}".format(p)
for p in prob]), word_text))
logger.info("begin to predict...")
use_default_data = (data_dir is None)
if use_default_data:
word_dict = paddle.dataset.imdb.word_dict()
word_reverse_dict = dict((value, key)
for key, value in word_dict.iteritems())
label_reverse_dict = {0: "positive", 1: "negative"}
test_reader = paddle.dataset.imdb.test(word_dict)
else:
assert os.path.exists(
word_dict_path), "the word dictionary file does not exist"
assert os.path.exists(
label_dict_path), "the label dictionary file does not exist"
word_dict = load_dict(word_dict_path)
word_reverse_dict = load_reverse_dict(word_dict_path)
label_reverse_dict = load_reverse_dict(label_dict_path)
test_reader = reader.test_reader(data_dir, word_dict)()
dict_dim = len(word_dict)
class_num = len(label_reverse_dict)
prob_layer = topology(dict_dim, class_num, is_infer=True)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
# load the trained models
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
inferer = paddle.inference.Inference(
output_layer=prob_layer, parameters=parameters)
test_batch = []
for idx, item in enumerate(test_reader):
test_batch.append([item[0]])
if len(test_batch) == batch_size:
_infer_a_batch(inferer, test_batch, word_reverse_dict,
label_reverse_dict)
test_batch = []
if len(test_batch):
_infer_a_batch(inferer, test_batch, word_reverse_dict,
label_reverse_dict)
test_batch = []
if __name__ == "__main__":
model_path = "models/dnn_params_pass_00000.tar.gz"
assert os.path.exists(model_path), "the trained model does not exist."
nn_type = "dnn"
test_dir = None
word_dict = None
label_dict = None
if nn_type == "dnn":
topology = fc_net
elif nn_type == "cnn":
topology = convolution_net
infer(
topology=topology,
data_dir=test_dir,
word_dict_path=word_dict,
label_dict_path=label_dict,
model_path=model_path,
batch_size=10)
import sys
import math
import gzip
from paddle.v2.layer import parse_network
import paddle.v2 as paddle
__all__ = ["fc_net", "convolution_net"]
def fc_net(dict_dim,
class_num,
emb_dim=28,
hidden_layer_sizes=[28, 8],
is_infer=False):
"""
    Define the topology of the DNN network.
    :param dict_dim: size of the word dictionary
    :type dict_dim: int
    :param class_num: number of instance classes
    :type class_num: int
    :param emb_dim: dimension of the embedding vectors
    :type emb_dim: int
"""
# define the input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
if not is_infer:
lbl = paddle.layer.data("label",
paddle.data_type.integer_value(class_num))
# define the embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# max pooling to reduce the input sequence into a vector (non-sequence)
seq_pool = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
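    # stack fully connected hidden layers on top of the pooled vector; the
    # first layer reads seq_pool, every later layer reads the previous one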
for idx, hidden_size in enumerate(hidden_layer_sizes):
hidden_init_std = 1.0 / math.sqrt(hidden_size)
hidden = paddle.layer.fc(
input=hidden if idx else seq_pool,
size=hidden_size,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=hidden_init_std))
prob = paddle.layer.fc(
input=hidden,
size=class_num,
act=paddle.activation.Softmax(),
param_attr=paddle.attr.Param(initial_std=1.0 / math.sqrt(class_num)))
if is_infer:
return prob
else:
return paddle.layer.classification_cost(
input=prob, label=lbl), prob, lbl
def convolution_net(dict_dim,
class_dim=2,
emb_dim=28,
hid_dim=128,
is_infer=False):
"""
    CNN network definition.
    :param dict_dim: size of the word dictionary
    :type dict_dim: int
    :param class_dim: number of instance classes
    :type class_dim: int
    :param emb_dim: dimension of the embedding vectors
    :type emb_dim: int
    :param hid_dim: number of convolution kernels of each kernel size
    :type hid_dim: int
"""
# input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
# embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# convolution layers with max pooling
conv_3 = paddle.networks.sequence_conv_pool(
input=emb, context_len=3, hidden_size=hid_dim)
conv_4 = paddle.networks.sequence_conv_pool(
input=emb, context_len=4, hidden_size=hid_dim)
# fc and output layer
prob = paddle.layer.fc(
input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax())
if is_infer:
return prob
else:
cost = paddle.layer.classification_cost(input=prob, label=lbl)
return cost, prob, lbl
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
def train_reader(data_dir, word_dict, label_dict):
"""
Reader interface for training data
:param data_dir: data directory
:type data_dir: str
    :param word_dict: the word dictionary; it must contain an "<UNK>" entry
    :type word_dict: Python dict
    :param label_dict: the label dictionary
    :type label_dict: Python dict
"""
def reader():
UNK_ID = word_dict["<UNK>"]
word_col = 1
lbl_col = 0
for file_name in os.listdir(data_dir):
with open(os.path.join(data_dir, file_name), "r") as f:
for line in f:
line_split = line.strip().split("\t")
word_ids = [
word_dict.get(w, UNK_ID)
for w in line_split[word_col].split()
]
yield word_ids, label_dict[line_split[lbl_col]]
return reader
def test_reader(data_dir, word_dict):
"""
Reader interface for testing data
:param data_dir: data directory.
:type data_dir: str
    :param word_dict: the word dictionary; it must contain an "<UNK>" entry
    :type word_dict: Python dict
"""
def reader():
UNK_ID = word_dict["<UNK>"]
word_col = 1
for file_name in os.listdir(data_dir):
with open(os.path.join(data_dir, file_name), "r") as f:
for line in f:
line_split = line.strip().split("\t")
                    # skip lines that have no text column
                    if len(line_split) <= word_col: continue
word_ids = [
word_dict.get(w, UNK_ID)
for w in line_split[word_col].split()
]
yield word_ids, line_split[word_col]
return reader
#!/bin/sh
python train.py \
--nn_type="dnn" \
--batch_size=64 \
--num_passes=10 \
2>&1 | tee train.log
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import sys
import gzip
import paddle.v2 as paddle
import reader
from utils import logger, parse_train_cmd, build_dict, load_dict
from network_conf import fc_net, convolution_net
def train(topology,
train_data_dir=None,
test_data_dir=None,
word_dict_path=None,
label_dict_path=None,
model_save_dir="models",
batch_size=32,
num_passes=10):
"""
    Train the text classification model.
    :param topology: function that defines the network topology
        (fc_net or convolution_net)
    :param train_data_dir: path of the training data; if this parameter
        is not specified, paddle.dataset.imdb is used to run this example
    :type train_data_dir: str
    :param test_data_dir: path of the testing data; if this parameter
        is not specified, paddle.dataset.imdb is used to run this example
    :type test_data_dir: str
    :param word_dict_path: path of the word dictionary; if this parameter
        is not specified, the dictionary is built from the training data
        (or paddle.dataset.imdb's dictionary is used when no training data
        are given)
    :type word_dict_path: str
    :param num_passes: number of training passes
    :type num_passes: int
"""
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
use_default_data = (train_data_dir is None)
if use_default_data:
logger.info(("No training data are porivided, "
"use paddle.dataset.imdb to train the model."))
logger.info("please wait to build the word dictionary ...")
word_dict = paddle.dataset.imdb.word_dict()
train_reader = paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100)
test_reader = paddle.batch(
lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
class_num = 2
else:
if word_dict_path is None or not os.path.exists(word_dict_path):
logger.info(("word dictionary is not given, the dictionary "
"is automatically built from the training data."))
# build the word dictionary to map the original string-typed
# words into integer-typed index
build_dict(
data_dir=train_data_dir,
save_path=word_dict_path,
use_col=1,
cutoff_fre=5,
insert_extra_words=["<UNK>"])
if not os.path.exists(label_dict_path):
logger.info(("label dictionary is not given, the dictionary "
"is automatically built from the training data."))
# build the label dictionary to map the original string-typed
# label into integer-typed index
build_dict(
data_dir=train_data_dir, save_path=label_dict_path, use_col=0)
word_dict = load_dict(word_dict_path)
lbl_dict = load_dict(label_dict_path)
class_num = len(lbl_dict)
logger.info("class number is : %d." % (len(lbl_dict)))
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(train_data_dir, word_dict, lbl_dict),
buf_size=1000),
batch_size=batch_size)
if test_data_dir is not None:
# here, because training and testing data share a same format,
# we still use the reader.train_reader to read the testing data.
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(test_data_dir, word_dict, lbl_dict),
buf_size=1000),
batch_size=batch_size)
else:
test_reader = None
dict_dim = len(word_dict)
logger.info("length of word dictionary is : %d." % (dict_dim))
paddle.init(use_gpu=False, trainer_count=1)
# network config
cost, prob, label = topology(dict_dim, class_num)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# create trainer
trainer = paddle.trainer.SGD(
cost=cost,
extra_layers=paddle.evaluator.auc(input=prob, label=label),
parameters=parameters,
update_equation=adam_optimizer)
# begin training network
feeding = {"word": 0, "label": 1}
def _event_handler(event):
"""
Define end batch and end pass event handler
"""
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("Test at Pass %d, %s \n" % (event.pass_id,
result.metrics))
with gzip.open(
os.path.join(model_save_dir, "dnn_params_pass_%05d.tar.gz" %
event.pass_id), "w") as f:
parameters.to_tar(f)
trainer.train(
reader=train_reader,
event_handler=_event_handler,
feeding=feeding,
num_passes=num_passes)
logger.info("Training has finished.")
def main(args):
if args.nn_type == "dnn":
topology = fc_net
elif args.nn_type == "cnn":
topology = convolution_net
train(
topology=topology,
train_data_dir=args.train_data_dir,
test_data_dir=args.test_data_dir,
word_dict_path=args.word_dict,
label_dict_path=args.label_dict,
batch_size=args.batch_size,
num_passes=args.num_passes,
model_save_dir=args.model_save_dir)
if __name__ == "__main__":
args = parse_train_cmd()
if args.train_data_dir is not None:
assert args.word_dict and args.label_dict, (
"the parameter train_data_dir, word_dict_path, and label_dict_path "
"should be set at the same time.")
main(args)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import argparse
from collections import defaultdict
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def parse_train_cmd():
parser = argparse.ArgumentParser(
description="PaddlePaddle text classification demo")
parser.add_argument(
"--nn_type",
type=str,
help="define which type of network to use, available: [dnn, cnn]",
default="dnn")
parser.add_argument(
"--train_data_dir",
type=str,
required=False,
help=("path of training dataset (default: None). "
"if this parameter is not set, "
"paddle.dataset.imdb will be used."),
default=None)
parser.add_argument(
"--test_data_dir",
type=str,
required=False,
help=("path of testing dataset (default: None). "
"if this parameter is not set, "
"paddle.dataset.imdb will be used."),
default=None)
parser.add_argument(
"--word_dict",
type=str,
required=False,
help=("path of word dictionary (default: None)."
"if this parameter is not set, paddle.dataset.imdb will be used."
"if this parameter is set, but the file does not exist, "
"word dictionay will be built from "
"the training data automatically."),
default=None)
parser.add_argument(
"--label_dict",
type=str,
required=False,
help=("path of label dictionay (default: None)."
"if this parameter is not set, paddle.dataset.imdb will be used."
"if this parameter is set, but the file does not exist, "
"word dictionay will be built from "
"the training data automatically."),
default=None)
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="the number of training examples in one forward/backward pass")
parser.add_argument(
"--num_passes", type=int, default=10, help="number of passes to train")
parser.add_argument(
"--model_save_dir",
type=str,
required=False,
help=("path to save the trained models."),
default="models")
return parser.parse_args()
def build_dict(data_dir,
save_path,
use_col=0,
cutoff_fre=0,
insert_extra_words=[]):
values = defaultdict(int)
for file_name in os.listdir(data_dir):
file_path = os.path.join(data_dir, file_name)
if not os.path.isfile(file_path):
continue
with open(file_path, "r") as fdata:
for line in fdata:
line_splits = line.strip().split("\t")
                # skip lines that do not have the requested column
                if len(line_splits) <= use_col: continue
for w in line_splits[use_col].split():
values[w] += 1
with open(save_path, "w") as f:
for w in insert_extra_words:
f.write("%s\t-1\n" % (w))
for v, count in sorted(
values.iteritems(), key=lambda x: x[1], reverse=True):
if count < cutoff_fre:
break
f.write("%s\t%d\n" % (v, count))
def load_dict(dict_path):
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))