Commit 59b4b872 authored by Yibing Liu

Merge branch 'develop' into ctc_decoder_deploy

@@ -25,3 +25,11 @@
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: convert-markdown-into-html
name: convert-markdown-into-html
description: Convert README.md into index.html
entry: python .pre-commit-hooks/convert_markdown_into_html.py
language: system
files: .+README\.md$
import argparse
import re
import sys
HEAD = """
<html>
<head>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
jax: ["input/TeX", "output/HTML-CSS"],
tex2jax: {
inlineMath: [ ['$','$'] ],
displayMath: [ ['$$','$$'] ],
processEscapes: true
},
"HTML-CSS": { availableFonts: ["TeX"] }
});
</script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
<script type="text/javascript" src="../.tools/theme/marked.js">
</script>
<link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
<script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
<link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
<link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
<link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
</head>
<style type="text/css" >
.markdown-body {
box-sizing: border-box;
min-width: 200px;
max-width: 980px;
margin: 0 auto;
padding: 45px;
}
</style>
<body>
<div id="context" class="container-fluid markdown-body">
</div>
<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
<div id="markdown" style='display:none'>
"""
TAIL = """
</div>
<!-- You can change the lines below now. -->
<script type="text/javascript">
marked.setOptions({
renderer: new marked.Renderer(),
gfm: true,
breaks: false,
smartypants: true,
highlight: function(code, lang) {
code = code.replace(/&amp;/g, "&")
code = code.replace(/&gt;/g, ">")
code = code.replace(/&lt;/g, "<")
code = code.replace(/&nbsp;/g, " ")
return hljs.highlightAuto(code, [lang]).value;
}
});
document.getElementById("context").innerHTML = marked(
document.getElementById("markdown").innerHTML)
</script>
</body>
"""
def convert_markdown_into_html(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument('filenames', nargs='*', help='Filenames to fix')
args = parser.parse_args(argv)
retv = 0
for filename in args.filenames:
with open(
re.sub(r"README", "index", re.sub(r"\.md$", ".html", filename)),
"w") as output:
output.write(HEAD)
with open(filename) as input:
for line in input:
output.write(line)
output.write(TAIL)
return retv
if __name__ == '__main__':
sys.exit(convert_markdown_into_html())
group: deprecated-2017Q2
language: cpp
cache: ccache
sudo: required
dist: trusty
services:
- docker
os:
- linux
env:
@@ -16,8 +19,12 @@ addons:
- python2.7-dev
before_install:
- pip install -U virtualenv pre-commit pip
- docker pull paddlepaddle/paddle:latest
script:
- .travis/precommit.sh
- docker run -i --rm -v "$PWD:/py_unittest" paddlepaddle/paddle:latest /bin/bash -c
'cd /py_unittest; sh .travis/unittest.sh'
notifications:
email:
on_success: change
...
#!/bin/bash
abort(){
echo "Run unittest failed" 1>&2
echo "Please check your code" 1>&2
exit 1
}
unittest(){
cd $1 > /dev/null
if [ -f "setup.sh" ]; then
sh setup.sh
fi
if [ $? != 0 ]; then
exit 1
fi
find . -name 'tests' -type d -print0 | \
xargs -0 -I{} -n1 bash -c \
'python -m unittest discover -v -s {}'
cd - > /dev/null
}
trap 'abort' 0
set -e
for proj in */ ; do
if [ -d $proj ]; then
unittest $proj
if [ $? != 0 ]; then
exit 1
fi
fi
done
trap : 0
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# models
Model configurations
[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://github.com/PaddlePaddle/models)
[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://github.com/PaddlePaddle/models)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
PaddlePaddle provides a rich collection of operators that let you build a great variety of deep learning models in a modular way to solve different application problems. Here we provide a set of neural network models for common machine learning tasks for you to study and use.
## 1. Word Embedding
A word embedding represents a word with a real-valued vector in which every dimension captures some latent syntactic or semantic feature of the text; it is one of the most successful concepts and results of applying deep learning to natural language processing. More broadly, embeddings can also be applied to ordinary discrete features. Learning embeddings is usually an unsupervised process, so it can make full use of massive unlabeled data to capture relationships among features, and it can effectively mitigate feature sparsity, missing labels, and noisy data. However, in common embedding-learning methods the last layer of the model often faces an extremely large-scale classification problem, which becomes the performance bottleneck.
In the word embedding examples, we show how to use Hierarchical Sigmoid and Noise Contrastive Estimation (NCE) to speed up embedding training.
- 1.1 [Accelerating word embedding training with Hsigmoid](https://github.com/PaddlePaddle/models/tree/develop/hsigmoid)
- 1.2 [Accelerating word embedding training with Noise Contrastive Estimation](https://github.com/PaddlePaddle/models/tree/develop/nce_cost)
## 2. Text Generation with an RNN Language Model
The language model is an important, fundamental model in natural language processing. Besides yielding word embeddings (a by-product of training), it can also help us generate text: given several words, a language model predicts the most likely next word. In the text-generation example we focus on recurrent neural network language models; following the usage notes in the documentation, you can quickly adapt them to your own corpus and build fun models that automatically write poems or prose.
- 2.1 [Text generation with an RNN language model](https://github.com/PaddlePaddle/models/tree/develop/generate_sequence_by_rnn_lm)
## 3. Click-Through Rate Prediction
A click-through rate (CTR) model predicts the probability that a user clicks on a given ad, making a prediction for every ad impression; it is one of the core algorithms of advertising technology. Logistic regression learns well from large-scale sparse features and dominated the early days of CTR prediction; in recent years, DNN models, with their strong learning capacity, have gradually taken over the task.
In the CTR example we present the Wide & Deep model proposed by Google, which combines the strengths of a DNN suited to learning abstract features and a logistic regression suited to large-scale sparse features. It can be used as a relatively mature model framework and has seen some industrial adoption.
- 3.1 [Wide & Deep CTR prediction model](https://github.com/PaddlePaddle/models/tree/develop/ctr)
## 4. Text Classification
Text classification is one of the most fundamental tasks in natural language processing. Deep learning methods remove the need for complex feature engineering, take raw text directly as input, and optimize classification accuracy in a data-driven way.
In the text classification example we take sentiment classification as the case study and provide a DNN-based non-sequential text classification model and a CNN-based sequential model for you to study and use (for an LSTM-based model, see the [Sentiment Analysis](https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/README.cn.md) chapter of PaddleBook).
- 4.1 [Sentiment classification with DNN / CNN](https://github.com/PaddlePaddle/models/tree/develop/text_classification)
## 5. Learning to Rank
Learning to Rank (LTR) is one of the core problems in information retrieval and search-engine research: a machine-learned scoring function scores the candidates to be ranked, and the ordering is decided by the scores. Deep neural networks can be used to model the scoring function, giving rise to various deep-learning-based LTR models.
In the LTR example we introduce a pairwise ranking model based on the RankLoss cost and a listwise ranking model based on the LambdaRank cost (for the pointwise strategy, see the [Recommender System](https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/README.cn.md) chapter of PaddleBook).
- 5.1 [Pairwise and listwise learning to rank](https://github.com/PaddlePaddle/models/tree/develop/ltr)
## 6. Sequence Tagging
Given an input sequence, a sequence tagging model assigns a class label to every element of the sequence; this is one of the most fundamental tasks in natural language processing. With the continuing development of deep learning, using a recurrent neural network to learn feature representations of the input and a Conditional Random Field (CRF) to finish the tagging on top of those features has gradually become the standard solution for sequence tagging.
In the sequence tagging example we take Named Entity Recognition (NER) as the case study and show how to train an end-to-end sequence tagging model.
- 6.1 [Named entity recognition](https://github.com/PaddlePaddle/models/tree/develop/sequence_tagging_for_ner)
## 7. Sequence-to-Sequence Learning
Sequence-to-sequence learning maps between two or more variable-length sequences and has a wide range of applications, including machine translation, dialogue and question answering, generation of advertising copy, auto-encoding (e.g. encoding financial profiles), and judging the semantic relatedness of multiple text strings.
In the sequence-to-sequence example we take machine translation as the case study and provide several improved models for you to study and use, including: a sequence-to-sequence model without attention, which is the basis of all sequence-to-sequence learning models; scheduled sampling to reduce error accumulation when an RNN generates sequences; and neural machine translation with external memory, which strengthens the network's memory to handle complex sequence-to-sequence tasks.
- 7.1 [Encoder-decoder model without attention](https://github.com/PaddlePaddle/models/tree/develop/nmt_without_attention)
## 8. Image Classification
Compared with text, images convey information that is more vivid, easier to understand, and more artistic; they are an important medium for people to share and exchange information. In the image classification example we show how to train AlexNet, VGG, GoogLeNet, and ResNet models in PaddlePaddle. We also provide a conversion tool that turns model files trained with Caffe into PaddlePaddle model files.
- 8.1 [Converting Caffe model files to PaddlePaddle model files](https://github.com/PaddlePaddle/models/tree/develop/image_classification/caffe2paddle)
- 8.2 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.3 [VGG](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
- 8.4 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/image_classification)
## Copyright and License
PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
# Click-Through Rate Prediction
## Background
Click-through rate (CTR) \[[1](https://en.wikipedia.org/wiki/Click-through_rate)\] denotes the probability that a user clicks on a specific link,
and is commonly used to measure the effectiveness of an online advertising system.
When there are multiple ad slots, the predicted CTR is usually the basis for ranking.
For example, in a search engine's advertising system, when a user enters a commercially valuable query, the system roughly performs the following steps to display ads:
1. Retrieve the set of ads matching the query
2. Filter by business rules and relevance
3. Rank according to the auction mechanism and CTR
4. Display the ads
As you can see, CTR plays an important role in the final ranking.
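As a rough, purely illustrative sketch of step 3 (not taken from this example's code): a common form of the ranking score multiplies the advertiser's bid by the predicted click probability, so a higher predicted CTR directly lifts an ad's position:
$$\text{score} = \text{bid} \times \text{pCTR}$$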
### Stages of Development
In industry, CTR models have gone through the following stages:
- Logistic Regression (LR) / GBDT + feature engineering
- LR + DNN features
- DNN + feature engineering
In the early days LR dominated, but more recently DNN models, with their strong learning capacity and increasingly mature performance optimizations,
have gradually taken over the CTR prediction task.
### LR vs DNN
The figure below compares the structure of LR with a \(3 \times 2\) DNN model:
<p align="center">
<img src="images/lr_vs_dnn.jpg" width="620" hspace='10'/> <br/>
Figure 1. Comparison of LR and DNN model structures
</p>
The blue-arrow part of LR maps directly onto the corresponding structure in the DNN, so LR and DNN share some common ground (such as weighted sums),
but for the same input dimensionality the former's model complexity can be much lower (roughly speaking, the more complex the model, the more potential it has to learn richer information).
For LR to match the learning capacity of a DNN, it has to increase the input dimensionality, that is, the number of features,
which is why LR is always tied to large-scale feature engineering.
LR's advantage over DNN models is its capacity for large-scale sparse features; in terms of both memory and computation, industry has very mature optimizations for it.
DNN models, on the other hand, can learn new features on their own, which improves feature efficiency to some extent,
so with the same set of features a DNN is more likely to achieve better results.
Later sections of this document show how to write a model in PaddlePaddle that combines the strengths of both; the equations below make the structural contrast concrete.
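For a purely illustrative comparison (the symbols below are generic and not from the example code): LR computes a single weighted sum of the raw features, while even a small DNN stacks nonlinear transformations on top of it, which is what lets it capture feature interactions that LR can only express through manually crossed features:
$$y_{\text{LR}} = \sigma\left(w^\top x + b\right), \qquad y_{\text{DNN}} = \sigma\left(w_2^\top \operatorname{ReLU}(W_1 x + b_1) + b_2\right)$$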
## Data and Task Abstraction
Taking `click` as the learning target, the task can be formulated in several ways:
1. Learn `click` directly as a binary (0/1) classification problem
2. Learning to rank, either pairwise (label 1 > 0) or listwise
3. Compute each ad's empirical click rate, pair up the ads under the same query with the higher-CTR ad ranked above the lower one, and learn it as a ranking or classification problem
We simply take the first formulation and treat it as a classification task; the objective this corresponds to is sketched below.
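Concretely, writing $p_i$ for the predicted click probability of sample $i$ and $y_i \in \{0,1\}$ for its label, the binary classification objective is the standard cross-entropy (stated here for reference; it is the quantity that the `multi_binary_label_cross_entropy_cost` layer used later evaluates for a single output):
$$L = -\frac{1}{N}\sum_{i=1}^{N}\left[\,y_i \log p_i + (1-y_i)\log(1-p_i)\,\right]$$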
We use the dataset of the Kaggle `Click-through rate prediction` contest \[[2](https://www.kaggle.com/c/avazu-ctr-prediction/data)\] to demonstrate the model.
See [data process](./dataset.md) for the details of feature processing.
## Wide & Deep Learning Model
In 2016 Google proposed the Wide & Deep Learning framework to combine the strengths of a DNN, which is suited to learning abstract features, and LR, which is suited to large-scale sparse features.
### Model Overview
The Wide & Deep Learning Model \[[3](#references)\] can be used as a relatively mature model framework,
and it has seen some industrial adoption for CTR prediction, so this document demonstrates using it for the CTR task.
The model structure is as follows:
<p align="center">
<img src="images/wide_deep.png" width="820" hspace='10'/> <br/>
Figure 2. Wide & Deep Model
</p>
The Wide part on the left can accommodate large-scale sparse features and has some memorization ability for specific information (such as IDs);
the Deep part on the right can learn latent relationships among features, giving better learning and generalization ability with the same number of features.
### Defining the Model Inputs
The model takes only three inputs:
- `dnn_input`, the input of the Deep part
- `lr_input`, the input of the Wide part
- `click`, whether the ad was clicked, used as the label for binary classification
```python
dnn_merged_input = layer.data(
name='dnn_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input']))
lr_merged_input = layer.data(
name='lr_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input']))
click = paddle.layer.data(name='click', type=dtype.dense_vector(1))
```
### Building the Wide Part
The Wide part uses the LR model directly, except that the activation is changed to `RELU` to speed things up.
```python
def build_lr_submodel():
fc = layer.fc(
input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu())
return fc
```
### Building the Deep Part
The Deep part is a standard multi-layer feed-forward DNN.
```python
def build_dnn_submodel(dnn_layer_dims):
dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0])
_input_layer = dnn_embedding
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = layer.fc(
input=_input_layer,
size=dim,
act=paddle.activation.Relu(),
name='dnn-fc-%d' % i)
_input_layer = fc
return _input_layer
```
### Combining the Two
The top-level outputs of the two submodels are combined by a weighted sum to produce the output of the whole model. The output uses `sigmoid` as the activation,
giving a prediction in the interval (0, 1) that approximates the distribution of the binary labels in the training data and serves as the final CTR estimate.
```python
# combine DNN and LR submodels
def combine_submodels(dnn, lr):
merge_layer = layer.concat(input=[dnn, lr])
fc = layer.fc(
input=merge_layer,
size=1,
name='output',
# use sigmoid function to approximate ctr, which is a float value between 0 and 1.
act=paddle.activation.Sigmoid())
return fc
```
### Defining the Training Task
```python
dnn = build_dnn_submodel(dnn_layer_dims)
lr = build_lr_submodel()
output = combine_submodels(dnn, lr)
# ==============================================================================
# cost and train period
# ==============================================================================
classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost(
input=output, label=click)
paddle.init(use_gpu=False, trainer_count=11)
params = paddle.parameters.create(classification_cost)
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(
cost=classification_cost, parameters=params, update_equation=optimizer)
dataset = AvazuDataset(train_data_path, n_records_as_test=test_set_size)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
logging.warning("Pass %d, Samples %d, Cost %f" % (
event.pass_id, event.batch_id * batch_size, event.cost))
if event.batch_id % 1000 == 0:
result = trainer.test(
reader=paddle.batch(dataset.test, batch_size=1000),
feeding=field_index)
logging.warning("Test %d-%d, Cost %f" % (event.pass_id, event.batch_id,
result.cost))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(dataset.train, buf_size=500),
batch_size=batch_size),
feeding=field_index,
event_handler=event_handler,
num_passes=100)
```
## Running Training and Testing
Training the model takes the following steps:
1. Download the training data; the data of the Kaggle CTR contest \[[2](#references)\] can be used
    1. Download train.gz from [Kaggle CTR](https://www.kaggle.com/c/avazu-ctr-prediction/data)
    2. Decompress train.gz to get train.txt
2. Run `python train.py --train_data_path train.txt` to start training
In step 2 you can pass command-line arguments to `train.py` to customize training; the available arguments and their usage are as follows:
```
usage: train.py [-h] --train_data_path TRAIN_DATA_PATH
[--batch_size BATCH_SIZE] [--test_set_size TEST_SET_SIZE]
[--num_passes NUM_PASSES]
[--num_lines_to_detact NUM_LINES_TO_DETACT]
PaddlePaddle CTR example
optional arguments:
-h, --help show this help message and exit
--train_data_path TRAIN_DATA_PATH
path of training dataset
--batch_size BATCH_SIZE
size of mini-batch (default:10000)
--test_set_size TEST_SET_SIZE
size of the validation dataset(default: 10000)
--num_passes NUM_PASSES
number of passes to train
--num_lines_to_detact NUM_LINES_TO_DETACT
number of records to detect dataset's meta info
```
## References
1. <https://en.wikipedia.org/wiki/Click-through_rate>
2. <https://www.kaggle.com/c/avazu-ctr-prediction/data>
3. Cheng H T, Koc L, Harmsen J, et al. [Wide & deep learning for recommender systems](https://arxiv.org/pdf/1606.07792.pdf)[C]//Proceedings of the 1st Workshop on Deep Learning for Recommender Systems. ACM, 2016: 7-10.
import sys
import csv
import numpy as np
'''
The fields of the dataset are:
0. id: ad identifier
1. click: 0/1 for non-click/click
2. hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
3. C1 -- anonymized categorical variable
4. banner_pos
5. site_id
6. site_domain
7. site_category
8. app_id
9. app_domain
10. app_category
11. device_id
12. device_ip
13. device_model
14. device_type
15. device_conn_type
16. C14-C21 -- anonymized categorical variables
We will treat following fields as categorical features:
- C1
- banner_pos
- site_category
- app_category
- device_type
- device_conn_type
and some other features as id features:
- id
- site_id
- app_id
- device_id
The `hour` field will be treated as a continuous feature and will be transformed
to one-hot representation which has 24 bits.
'''
feature_dims = {}
categorial_features = ('C1 banner_pos site_category app_category ' +
'device_type device_conn_type').split()
id_features = 'id site_id app_id device_id _device_id_cross_site_id'.split()
def get_all_field_names(mode=0):
'''
@mode: int
0 for train, 1 for test
@return: list of str
'''
return categorial_features + ['hour'] + id_features + ['click'] \
if mode == 0 else []
class CategoryFeatureGenerator(object):
'''
Generator of category features.
Register all records by calling `register` first, then call `gen` to generate
one-hot representation for a record.
'''
def __init__(self):
self.dic = {'unk': 0}
self.counter = 1
def register(self, key):
'''
Register record.
'''
if key not in self.dic:
self.dic[key] = self.counter
self.counter += 1
def size(self):
return len(self.dic)
def gen(self, key):
'''
Generate one-hot representation for a record.
'''
if key not in self.dic:
res = self.dic['unk']
else:
res = self.dic[key]
return [res]
def __repr__(self):
return '<CategoryFeatureGenerator %d>' % len(self.dic)
class IDfeatureGenerator(object):
def __init__(self, max_dim, cross_fea0=None, cross_fea1=None):
'''
@max_dim: int
Size of the id elements' space
'''
self.max_dim = max_dim
self.cross_fea0 = cross_fea0
self.cross_fea1 = cross_fea1
def gen(self, key):
'''
Generate one-hot representation for records
'''
return [hash(key) % self.max_dim]
def gen_cross_fea(self, fea1, fea2):
key = str(fea1) + str(fea2)
return self.gen(key)
def size(self):
return self.max_dim
class ContinuousFeatureGenerator(object):
def __init__(self, n_intervals):
self.min = sys.maxint
self.max = -sys.maxint
self.n_intervals = n_intervals
def register(self, val):
self.min = min(self.min, val)
self.max = max(self.max, val)
def gen(self, val):
self.len_part = (self.max - self.min) / self.n_intervals
return (val - self.min) / self.len_part
# init all feature generators
fields = {}
for key in categorial_features:
fields[key] = CategoryFeatureGenerator()
for key in id_features:
# for cross features
if 'cross' in key:
feas = key[1:].split('_cross_')
fields[key] = IDfeatureGenerator(10000000, *feas)
# for normal ID features
else:
fields[key] = IDfeatureGenerator(10000)
# used as feed_dict in PaddlePaddle
field_index = dict((key, id)
for id, key in enumerate(['dnn_input', 'lr_input', 'click']))
def detect_dataset(path, topn, id_fea_space=10000):
'''
Parse the first `topn` records to collect meta information of this dataset.
NOTE the records should be randomly shuffled first.
'''
# create categorical statistics objects.
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
for row_id, row in enumerate(reader):
if row_id > topn:
break
for key in categorial_features:
fields[key].register(row[key])
for key, item in fields.items():
feature_dims[key] = item.size()
#for key in id_features:
#feature_dims[key] = id_fea_space
feature_dims['hour'] = 24
feature_dims['click'] = 1
feature_dims['dnn_input'] = np.sum(
feature_dims[key] for key in categorial_features + ['hour']) + 1
feature_dims['lr_input'] = np.sum(feature_dims[key]
for key in id_features) + 1
return feature_dims
def concat_sparse_vectors(inputs, dims):
'''
Concatenate more than one sparse vector into one.
@inputs: list
list of sparse vector
@dims: list of int
dimension of each sparse vector
'''
res = []
assert len(inputs) == len(dims)
start = 0
for no, vec in enumerate(inputs):
for v in vec:
res.append(v + start)
start += dims[no]
return res
class AvazuDataset(object):
'''
Load AVAZU dataset as train set.
'''
TRAIN_MODE = 0
TEST_MODE = 1
def __init__(self, train_path, n_records_as_test=-1):
self.train_path = train_path
self.n_records_as_test = n_records_as_test
# task mode: 0 train, 1 test
self.mode = 0
def train(self):
self.mode = self.TRAIN_MODE
return self._parse(self.train_path, skip_n_lines=self.n_records_as_test)
def test(self):
self.mode = self.TEST_MODE
return self._parse(self.train_path, top_n_lines=self.n_records_as_test)
def _parse(self, path, skip_n_lines=-1, top_n_lines=-1):
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
categorial_dims = [
feature_dims[key] for key in categorial_features + ['hour']
]
id_dims = [feature_dims[key] for key in id_features]
for row_id, row in enumerate(reader):
if skip_n_lines > 0 and row_id < skip_n_lines:
continue
if top_n_lines > 0 and row_id > top_n_lines:
break
record = []
for key in categorial_features:
record.append(fields[key].gen(row[key]))
record.append([int(row['hour'][-2:])])
dense_input = concat_sparse_vectors(record, categorial_dims)
record = []
for key in id_features:
if 'cross' not in key:
record.append(fields[key].gen(row[key]))
else:
fea0 = fields[key].cross_fea0
fea1 = fields[key].cross_fea1
record.append(
fields[key].gen_cross_fea(row[fea0], row[fea1]))
sparse_input = concat_sparse_vectors(record, id_dims)
record = [dense_input, sparse_input]
record.append(list((int(row['click']), )))
yield record
if __name__ == '__main__':
path = 'train.txt'
print detect_dataset(path, 400000)
filereader = AvazuDataset(path)
for no, rcd in enumerate(filereader.train()):
print no, rcd
if no > 1000: break
# Data and Preprocessing
## The Dataset
The dataset is stored in `csv` format, with the following fields:
- `id` : ad identifier
- `click` : 0/1 for non-click/click
- `hour` : format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.
- `C1` : anonymized categorical variable
- `banner_pos`
- `site_id`
- `site_domain`
- `site_category`
- `app_id`
- `app_domain`
- `app_category`
- `device_id`
- `device_ip`
- `device_model`
- `device_type`
- `device_conn_type`
- `C14-C21` : anonymized categorical variables
## Feature Extraction
Below we briefly demonstrate how several kinds of features are extracted.
The features in the raw data fall into the following categories:
1. ID features (sparse and very numerous)
- `id`
- `site_id`
- `app_id`
- `device_id`
2. Categorical features (sparse, but limited in number)
- `C1`
- `site_category`
- `device_type`
- `C14-C21`
3. Numerical features converted into categorical features
- hour (can be used as a numerical value, or converted into categories at hour granularity)
### Categorical Features
Categorical features can be extracted in two ways:
1. Use a one-hot representation directly as the feature
2. As with word vectors, use an Embedding to map each category to a corresponding vector
### ID Features
ID features are sparse but very numerous, so representing them directly as one-hot vectors would make the dimensionality far too large.
They are usually handled as follows:
1. Choose a maximum dimensionality N for the representation
2. newid = id % N
3. Use newid as a categorical feature
Although this method has some probability of collisions, it can handle an arbitrary number of ID features and still retains reasonable effectiveness \[[2](#references)\]; a minimal sketch is given below.
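A minimal, illustrative sketch of the modulo mapping described above (the function name is made up; the `IDfeatureGenerator` class in the feature-processing section implements the same idea):
```python
# illustrative only: fold arbitrarily many raw IDs into a fixed space of size N
N = 10000
def to_categorical_id(raw_id, n=N):
    # collisions are possible, but the feature space stays bounded at n
    return hash(raw_id) % n

print(to_categorical_id('a99f214a'))  # some index in [0, n)
```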
### Numerical Features
They are usually handled in one of the following ways:
- Normalize them and feed them into the model directly as features
- Split them into intervals and treat them as categorical features, which sparsifies the representation and blurs minor differences; a small sketch of this bucketing follows the list
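A small, illustrative sketch of the interval-splitting approach (the helper below is made up for illustration; the `ContinuousFeatureGenerator` class in data_provider.py follows the same idea):
```python
def bucketize(val, lo, hi, n_intervals):
    # clip into [lo, hi], then map to one of n_intervals equal-width buckets
    val = min(max(val, lo), hi)
    width = float(hi - lo) / n_intervals
    return min(int((val - lo) / width), n_intervals - 1)

print(bucketize(14.5, 0, 24, 24))  # the value 14.5 falls into bucket 14
```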
## Feature Processing
### Categorical Features
A categorical feature takes a limited number of distinct values; in a model we usually use an Embedding to map each value to a vector of continuous values.
When fed into the model, such a feature is usually represented one-hot; the related processing is as follows:
```python
class CategoryFeatureGenerator(object):
'''
Generator of category features.
Register all records by calling ~register~ first, then call ~gen~ to generate
one-hot representation for a record.
'''
def __init__(self):
self.dic = {'unk': 0}
self.counter = 1
def register(self, key):
'''
Register record.
'''
if key not in self.dic:
self.dic[key] = self.counter
self.counter += 1
def size(self):
return len(self.dic)
def gen(self, key):
'''
Generate one-hot representation for a record.
'''
if key not in self.dic:
res = self.dic['unk']
else:
res = self.dic[key]
return [res]
def __repr__(self):
return '<CategoryFeatureGenerator %d>' % len(self.dic)
```
`CategoryFeatureGenerator` must first scan the dataset and collect the set of values of each category before it can start generating features.
Our experimental dataset \[[3](https://www.kaggle.com/c/avazu-ctr-prediction/data)\] has already been shuffled, so scanning a certain number of leading records approximates the full set of category values (equivalent to random sampling);
low-frequency values that were not sampled can be represented by a special UNK value.
```python
fields = {}
for key in categorial_features:
fields[key] = CategoryFeatureGenerator()
def detect_dataset(path, topn, id_fea_space=10000):
'''
Parse the first `topn` records to collect meta information of this dataset.
NOTE the records should be randomly shuffled first.
'''
# create categorical statistics objects.
with open(path, 'rb') as csvfile:
reader = csv.DictReader(csvfile)
for row_id, row in enumerate(reader):
if row_id > topn:
break
for key in categorial_features:
fields[key].register(row[key])
```
Once `CategoryFeatureGenerator` has registered the category information found in the dataset, it can generate the corresponding feature representation for each record:
```python
record = []
for key in categorial_features:
record.append(fields[key].gen(row[key]))
```
In this task, categorical features are fed into the DNN.
### ID Features
ID features stand for sparse values with a very large value space; they are usually reduced to a bounded space with a modulo operation
and can then be used like categorical features. Here, the ID features are fed into the LR model.
```python
class IDfeatureGenerator(object):
def __init__(self, max_dim):
'''
@max_dim: int
Size of the id elements' space
'''
self.max_dim = max_dim
def gen(self, key):
'''
Generate one-hot representation for records
'''
return [hash(key) % self.max_dim]
def size(self):
return self.max_dim
```
`IDfeatureGenerator` needs no prior initialization and can generate features directly, for example:
```python
record = []
for key in id_features:
if 'cross' not in key:
record.append(fields[key].gen(row[key]))
```
### Cross Features
As the `wide` part of the Wide & Deep model, LR can take very wide input (a feature space of very high dimensionality).
To take full advantage of this, we demonstrate crossing features into a combined feature of higher dimensionality, which is then fed into the model for training.
We again use a modulo operation to bound the size of the resulting combined feature space; concretely, we simply add a `gen_cross_fea` method to `IDfeatureGenerator`:
```python
def gen_cross_fea(self, fea1, fea2):
key = str(fea1) + str(fea2)
return self.gen(key)
```
For example, we suspect that `device_id` and `site_id` are correlated in the raw data (say, a particular device tends to visit particular sites),
so we combine the two to capture this kind of information.
```python
fea0 = fields[key].cross_fea0
fea1 = fields[key].cross_fea1
record.append(
fields[key].gen_cross_fea(row[fea0], row[fea1]))
```
### Feature Dimensions
#### Deep Submodel (DNN) Features
| Feature | Dimension |
|------------------|-----------|
| app_category | 21 |
| site_category | 22 |
| device_conn_type | 5 |
| hour | 24 |
| banner_pos | 7 |
| **Total** | 79 |
#### Wide Submodel (LR) Features
| Feature | Dimension |
|---------------------|-----------|
| id | 10000 |
| site_id | 10000 |
| app_id | 10000 |
| device_id | 10000 |
| device_id X site_id | 1000000 |
| **Total** | 1,040,000 |
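As a quick sanity check of the table above, the Wide input width is simply the sum of the four hashed ID spaces plus the crossed feature space:
$$4 \times 10000 + 1000000 = 1040000$$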
## Feeding the Data into PaddlePaddle
Both the Deep and the Wide part take their input in the `sparse_binary_vector` format \[[1](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst)\]; the relevant features need to be concatenated before they are fed in. In the end the model accepts only 3 inputs,
namely
1. `dnn input`, the input of the DNN part
2. `lr input`, the input of the LR part
3. `click`, the label
The features are concatenated as follows:
```python
def concat_sparse_vectors(inputs, dims):
'''
concatenate sparse vectors into one
@inputs: list
list of sparse vector
@dims: list of int
dimension of each sparse vector
'''
res = []
assert len(inputs) == len(dims)
start = 0
for no, vec in enumerate(inputs):
for v in vec:
res.append(v + start)
start += dims[no]
return res
```
The code that produces the final features is as follows:
```python
# dimensions of the features
categorial_dims = [
feature_dims[key] for key in categorial_features + ['hour']
]
id_dims = [feature_dims[key] for key in id_features]
dense_input = concat_sparse_vectors(record, categorial_dims)
sparse_input = concat_sparse_vectors(record, id_dims)
record = [dense_input, sparse_input]
record.append(list((int(row['click']), )))
yield record
```
## References
1. <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/api/v1/data_provider/pydataprovider2_en.rst>
2. Mikolov T, Deoras A, Povey D, et al. [Strategies for training large scale neural network language models](https://www.researchgate.net/profile/Lukas_Burget/publication/241637478_Strategies_for_training_large_scale_neural_network_language_models/links/542c14960cf27e39fa922ed3.pdf)[C]//Automatic Speech Recognition and Understanding (ASRU), 2011 IEEE Workshop on. IEEE, 2011: 196-201.
3. <https://www.kaggle.com/c/avazu-ctr-prediction/data>
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import logging
import paddle.v2 as paddle
from paddle.v2 import layer
from paddle.v2 import data_type as dtype
from data_provider import field_index, detect_dataset, AvazuDataset
parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
parser.add_argument(
'--train_data_path',
type=str,
required=True,
help="path of training dataset")
parser.add_argument(
'--batch_size',
type=int,
default=10000,
help="size of mini-batch (default:10000)")
parser.add_argument(
'--test_set_size',
type=int,
default=10000,
help="size of the validation dataset(default: 10000)")
parser.add_argument(
'--num_passes', type=int, default=10, help="number of passes to train")
parser.add_argument(
'--num_lines_to_detact',
type=int,
default=500000,
help="number of records to detect dataset's meta info")
args = parser.parse_args()
dnn_layer_dims = [128, 64, 32, 1]
data_meta_info = detect_dataset(args.train_data_path, args.num_lines_to_detact)
logging.warning('detect categorical fields in dataset %s' %
args.train_data_path)
for key, item in data_meta_info.items():
logging.warning(' - {}\t{}'.format(key, item))
paddle.init(use_gpu=False, trainer_count=1)
# ==============================================================================
# input layers
# ==============================================================================
dnn_merged_input = layer.data(
name='dnn_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['dnn_input']))
lr_merged_input = layer.data(
name='lr_input',
type=paddle.data_type.sparse_binary_vector(data_meta_info['lr_input']))
click = paddle.layer.data(name='click', type=dtype.dense_vector(1))
# ==============================================================================
# network structure
# ==============================================================================
def build_dnn_submodel(dnn_layer_dims):
dnn_embedding = layer.fc(input=dnn_merged_input, size=dnn_layer_dims[0])
_input_layer = dnn_embedding
for i, dim in enumerate(dnn_layer_dims[1:]):
fc = layer.fc(
input=_input_layer,
size=dim,
act=paddle.activation.Relu(),
name='dnn-fc-%d' % i)
_input_layer = fc
return _input_layer
# config LR submodel
def build_lr_submodel():
fc = layer.fc(
input=lr_merged_input, size=1, name='lr', act=paddle.activation.Relu())
return fc
# combine DNN and LR submodels
def combine_submodels(dnn, lr):
merge_layer = layer.concat(input=[dnn, lr])
fc = layer.fc(
input=merge_layer,
size=1,
name='output',
# use sigmoid function to approximate the ctr, a float value between 0 and 1.
act=paddle.activation.Sigmoid())
return fc
dnn = build_dnn_submodel(dnn_layer_dims)
lr = build_lr_submodel()
output = combine_submodels(dnn, lr)
# ==============================================================================
# cost and train period
# ==============================================================================
classification_cost = paddle.layer.multi_binary_label_cross_entropy_cost(
input=output, label=click)
params = paddle.parameters.create(classification_cost)
optimizer = paddle.optimizer.Momentum(momentum=0.01)
trainer = paddle.trainer.SGD(
cost=classification_cost, parameters=params, update_equation=optimizer)
dataset = AvazuDataset(
args.train_data_path, n_records_as_test=args.test_set_size)
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
num_samples = event.batch_id * args.batch_size
if event.batch_id % 100 == 0:
logging.warning("Pass %d, Samples %d, Cost %f" %
(event.pass_id, num_samples, event.cost))
if event.batch_id % 1000 == 0:
result = trainer.test(
reader=paddle.batch(dataset.test, batch_size=args.batch_size),
feeding=field_index)
logging.warning("Test %d-%d, Cost %f" %
(event.pass_id, event.batch_id, result.cost))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(dataset.train, buf_size=500),
batch_size=args.batch_size),
feeding=field_index,
event_handler=event_handler,
num_passes=args.num_passes)
# Deep Speech 2 on PaddlePaddle
## Installation
Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.
```
sh setup.sh
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
```
For some machines, we also need to install libsndfile1. Details to be added.
## Usage
### Preparing Data
```
cd datasets
sh run_all.sh
cd ..
```
`sh run_all.sh` prepares all ASR datasets (currently, only LibriSpeech is available). After it finishes, several summarizing manifest files are produced in JSON format.
A manifest file summarizes a speech dataset: each line is a JSON record holding the metadata (i.e. audio filepath, transcript text, audio duration) of one audio file in the dataset. The manifest serves as the interface that tells the system where the speech samples are and what to read; a hypothetical line is shown below.
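For illustration only (the exact field names and values are whatever the dataset scripts emit; the line below is made up), a manifest entry might look like:
```
{"audio_filepath": "/data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac", "duration": 5.86, "text": "some transcript text in lowercase"}
```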
More help for arguments:
```
python datasets/librispeech/librispeech.py --help
```
### Preparing for Training
```
python compute_mean_std.py
```
`python compute_mean_std.py` computes the mean and standard deviation of the audio features and saves them to a file named `./mean_std.npz` by default. This file is used in both training and inference; a quick way to inspect it is shown below.
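For a quick sanity check (illustrative only; the array names stored in the archive are whatever `FeatureNormalizer.write_to_file` chooses), you can list the contents of the generated file with numpy:
```python
import numpy as np

# open the normalizer statistics written by compute_mean_std.py
stats = np.load('mean_std.npz')
print(stats.files)                  # names of the stored arrays
for name in stats.files:
    print(name, stats[name].shape)  # shape of each stored array
```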
More help for arguments:
```
python compute_mean_std.py --help
```
### Training
For GPU Training:
```
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py
```
For CPU Training:
```
python train.py --use_gpu False
```
More help for arguments:
```
python train.py --help
```
### Inferencing
```
CUDA_VISIBLE_DEVICES=0 python infer.py
```
More help for arguments:
```
python infer.py --help
```
"""Compute mean and std for feature normalizer, and save to file."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
parser = argparse.ArgumentParser(
description='Computing mean and stddev for feature normalizer.')
parser.add_argument(
"--manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for computing normalizer's mean and stddev."
"(default: %(default)s)")
parser.add_argument(
"--num_samples",
default=2000,
type=int,
help="Number of samples for computing mean and stddev. "
"(default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='{}',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--output_file",
default='mean_std.npz',
type=str,
help="Filepath to write mean and std to (.npz)."
"(default: %(default)s)")
args = parser.parse_args()
def main():
augmentation_pipeline = AugmentationPipeline(args.augmentation_config)
audio_featurizer = AudioFeaturizer()
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=args.manifest_path,
featurize_func=augment_and_featurize,
num_samples=args.num_samples)
normalizer.write_to_file(args.output_file)
if __name__ == '__main__':
main()
This diff is collapsed.
"""Contains the data augmentation pipeline."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor
class AugmentationPipeline(object):
"""Build a pre-processing pipeline with various augmentation models.Such a
data augmentation pipeline is oftern leveraged to augment the training
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
The pipeline is built according the the augmentation configuration in json
string, e.g.
.. code-block::
'[{"type": "volume",
"params": {"min_gain_dBFS": -15,
"max_gain_dBFS": 15},
"prob": 0.5},
{"type": "speed",
"params": {"min_speed_rate": 0.8,
"max_speed_rate": 1.2},
"prob": 0.5}
]'
This augmentation configuration inserts two augmentation models
into the pipeline, one a VolumePerturbAugmentor and the other a
SpeedPerturbAugmentor. "prob" indicates the probability that the
augmentor takes effect.
:param augmentation_config: Augmentation configuration in json string.
:type augmentation_config: str
:param random_seed: Random seed.
:type random_seed: int
:raises ValueError: If the augmentation json config is in an incorrect format.
"""
def __init__(self, augmentation_config, random_seed=0):
self._rng = random.Random(random_seed)
self._augmentors, self._rates = self._parse_pipeline_from(
augmentation_config)
def transform_audio(self, audio_segment):
"""Run the pre-processing pipeline for data augmentation.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to process.
:type audio_segment: AudioSegment|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
if self._rng.uniform(0., 1.) <= rate:
augmentor.transform_audio(audio_segment)
def _parse_pipeline_from(self, config_json):
"""Parse the config json to build a augmentation pipelien."""
try:
configs = json.loads(config_json)
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in configs
]
rates = [config["prob"] for config in configs]
except Exception as e:
raise ValueError("Failed to parse the augmentation config json: "
"%s" % str(e))
return augmentors, rates
def _get_augmentor(self, augmentor_type, params):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
elif augmentor_type == "speed":
return SpeedPerturbAugmentor(self._rng, **params)
elif augmentor_type == "resample":
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
"""Contains the abstract base class for augmentation models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from abc import ABCMeta, abstractmethod
class AugmentorBase(object):
"""Abstract base class for augmentation model (augmentor) class.
All augmentor classes should inherit from this class, and implement the
following abstract methods.
"""
__metaclass__ = ABCMeta
@abstractmethod
def __init__(self):
pass
@abstractmethod
def transform_audio(self, audio_segment):
"""Adds various effects to the input audio segment. Such effects
will augment the training data to make the model invariant to certain
types of perturbations in the real world, improving model's
generalization ability.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
pass
"""Contain the online bayesian normalization augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class OnlineBayesianNormalizationAugmentor(AugmentorBase):
"""Augmentation model for adding online bayesian normalization.
:param rng: Random generator object.
:type rng: random.Random
:param target_db: Target RMS value in decibels.
:type target_db: float
:param prior_db: Prior RMS estimate in decibels.
:type prior_db: float
:param prior_samples: Prior strength in number of samples.
:type prior_samples: int
:param startup_delay: Default 0.0s. If provided, this function will
accrue statistics for the first startup_delay
seconds before applying online normalization.
:type startup_delay: float.
"""
def __init__(self,
rng,
target_db,
prior_db,
prior_samples,
startup_delay=0.0):
self._target_db = target_db
self._prior_db = prior_db
self._prior_samples = prior_samples
self._rng = rng
self._startup_delay = startup_delay
def transform_audio(self, audio_segment):
"""Normalizes the input audio using the online Bayesian approach.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
self._prior_samples,
self._startup_delay)
"""Contain the resample augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class ResampleAugmentor(AugmentorBase):
"""Augmentation model for resampling.
See more info here:
https://ccrma.stanford.edu/~jos/resample/index.html
:param rng: Random generator object.
:type rng: random.Random
:param new_sample_rate: New sample rate in Hz.
:type new_sample_rate: int
"""
def __init__(self, rng, new_sample_rate):
self._new_sample_rate = new_sample_rate
self._rng = rng
def transform_audio(self, audio_segment):
"""Resamples the input audio to a target sample rate.
Note that this is an in-place transformation.
:param audio: Audio segment to add effects to.
:type audio: AudioSegment|SpeechSegment
"""
audio_segment.resample(self._new_sample_rate)
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class ShiftPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random shift perturbation.
:param rng: Random generator object.
:type rng: random.Random
:param min_shift_ms: Minimal shift in milliseconds.
:type min_shift_ms: float
:param max_shift_ms: Maximal shift in milliseconds.
:type max_shift_ms: float
"""
def __init__(self, rng, min_shift_ms, max_shift_ms):
self._min_shift_ms = min_shift_ms
self._max_shift_ms = max_shift_ms
self._rng = rng
def transform_audio(self, audio_segment):
"""Shift audio.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
audio_segment.shift(shift_ms)
"""Contain the speech perturbation augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class SpeedPerturbAugmentor(AugmentorBase):
"""Augmentation model for adding speed perturbation.
See reference paper here:
http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
:param rng: Random generator object.
:type rng: random.Random
:param min_speed_rate: Lower bound of new speed rate to sample and should
not be smaller than 0.9.
:type min_speed_rate: float
:param max_speed_rate: Upper bound of new speed rate to sample and should
not be larger than 1.1.
:type max_speed_rate: float
"""
def __init__(self, rng, min_speed_rate, max_speed_rate):
if min_speed_rate < 0.9:
raise ValueError(
"Sampling speed below 0.9 can cause unnatural effects")
if max_speed_rate > 1.1:
raise ValueError(
"Sampling speed above 1.1 can cause unnatural effects")
self._min_speed_rate = min_speed_rate
self._max_speed_rate = max_speed_rate
self._rng = rng
def transform_audio(self, audio_segment):
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
sampled_speed = self._rng.uniform(self._min_speed_rate,
self._max_speed_rate)
audio_segment.change_speed(sampled_speed)
"""Contains the volume perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.augmentor.base import AugmentorBase
class VolumePerturbAugmentor(AugmentorBase):
"""Augmentation model for adding random volume perturbation.
This is used for multi-loudness training of PCEN. See
https://arxiv.org/pdf/1607.05666v1.pdf
for more details.
:param rng: Random generator object.
:type rng: random.Random
:param min_gain_dBFS: Minimal gain in dBFS.
:type min_gain_dBFS: float
:param max_gain_dBFS: Maximal gain in dBFS.
:type max_gain_dBFS: float
"""
def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
self._min_gain_dBFS = min_gain_dBFS
self._max_gain_dBFS = max_gain_dBFS
self._rng = rng
def transform_audio(self, audio_segment):
"""Change audio loadness.
Note that this is an in-place transformation.
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
audio_segment.gain_db(gain)
"""Contains data generator for orgnaizing various audio data preprocessing
pipeline and offering data reader interface of PaddlePaddle requirements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
import multiprocessing
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer
class DataGenerator(object):
"""
DataGenerator provides a basic audio data preprocessing pipeline, and offers
data reader interfaces that meet PaddlePaddle's requirements.
:param vocab_filepath: Vocabulary filepath for indexing tokenized
transcripts.
:type vocab_filepath: basestring
:param mean_std_filepath: File containing the pre-computed mean and stddev.
:type mean_std_filepath: None|basestring
:param augmentation_config: Augmentation configuration in json string.
For details, see AugmentationPipeline.__doc__.
:type augmentation_config: str
:param max_duration: Audio with duration (in seconds) greater than
this will be discarded.
:type max_duration: float
:param min_duration: Audio with duration (in seconds) smaller than
this will be discarded.
:type min_duration: float
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param use_dB_normalization: Whether to normalize the audio to -20 dB
before extracting the features.
:type use_dB_normalization: bool
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
"""
def __init__(self,
vocab_filepath,
mean_std_filepath,
augmentation_config='{}',
max_duration=float('inf'),
min_duration=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
self._augmentation_pipeline = AugmentationPipeline(
augmentation_config=augmentation_config, random_seed=random_seed)
self._speech_featurizer = SpeechFeaturizer(
vocab_filepath=vocab_filepath,
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
use_dB_normalization=use_dB_normalization)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0
def batch_reader_creator(self,
manifest_path,
batch_size,
min_batch_size=1,
padding_to=-1,
flatten=False,
sortagrad=False,
shuffle_method="batch_shuffle"):
"""
Batch data reader creator for audio data. Return a callable generator
function to produce batches of data.
Audio features within one batch will be padded with zeros to have the
same shape, or a user-defined shape.
:param manifest_path: Filepath of manifest for audio files.
:type manifest_path: basestring
:param batch_size: Number of instances in a batch.
:type batch_size: int
:param min_batch_size: Any batch with batch size smaller than this will
be discarded. (To be deprecated in the future.)
:type min_batch_size: int
:param padding_to: If set -1, the maximum shape in the batch
will be used as the target shape for padding.
Otherwise, `padding_to` will be the target shape.
:type padding_to: int
:param flatten: If set True, audio features will be flattened to a 1-D array.
:type flatten: bool
:param sortagrad: If set True, sort the instances by audio duration
in the first epoch to speed up training.
:type sortagrad: bool
:param shuffle_method: Shuffle method. Options:
'' or None: no shuffle.
'instance_shuffle': instance-wise shuffle.
'batch_shuffle': similarly-sized instances are
put into batches, and then
batch-wise shuffle the batches.
For more details, please see
``_batch_shuffle.__doc__``.
'batch_shuffle_clipped': 'batch_shuffle' with
head shift and tail
clipping. For more
details, please see
``_batch_shuffle``.
If sortagrad is True, shuffle is disabled
for the first epoch.
:type shuffle_method: None|str
:return: Batch reader function, producing batches of data when called.
:rtype: callable
"""
def batch_reader():
# read manifest
manifest = utils.read_manifest(
manifest_path=manifest_path,
max_duration=self._max_duration,
min_duration=self._min_duration)
# sort (by duration) or batch-wise shuffle the manifest
if self._epoch == 0 and sortagrad:
manifest.sort(key=lambda x: x["duration"])
else:
if shuffle_method == "batch_shuffle":
manifest = self._batch_shuffle(
manifest, batch_size, clipped=False)
elif shuffle_method == "batch_shuffle_clipped":
manifest = self._batch_shuffle(
manifest, batch_size, clipped=True)
elif shuffle_method == "instance_shuffle":
self._rng.shuffle(manifest)
elif not shuffle_method:
pass
else:
raise ValueError("Unknown shuffle method %s." %
shuffle_method)
# prepare batches
instance_reader = self._instance_reader_creator(manifest)
batch = []
for instance in instance_reader():
batch.append(instance)
if len(batch) == batch_size:
yield self._padding_batch(batch, padding_to, flatten)
batch = []
if len(batch) >= min_batch_size:
yield self._padding_batch(batch, padding_to, flatten)
self._epoch += 1
return batch_reader
@property
def feeding(self):
"""Returns data reader's feeding dict.
:return: Data feeding dict.
:rtype: dict
"""
return {"audio_spectrogram": 0, "transcript_text": 1}
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._speech_featurizer.vocab_list
def _process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data."""
speech_segment = SpeechSegment.from_file(filename, transcript)
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
specgram = self._normalizer.apply(specgram)
return specgram, text_ids
def _instance_reader_creator(self, manifest):
"""
Instance reader creator. Create a callable function to produce
instances of data.
Instance: a tuple of ndarray of audio spectrogram and a list of
token indices for transcript.
"""
def reader():
for instance in manifest:
yield instance
def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])
return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
If ``padding_to`` is -1, the maximum shape in the batch will be used
as the target shape for padding. Otherwise, `padding_to` will be the
target shape (only refers to the second axis).
If `flatten` is True, features will be flattened to a 1-D array.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
# padding
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
new_batch.append((padded_audio, text))
return new_batch
def _batch_shuffle(self, manifest, batch_size, clipped=False):
"""Put similarly-sized instances into minibatches for better efficiency
and make a batch-wise shuffle.
1. Sort the audio clips by duration.
2. Generate a random number `k`, k in [0, batch_size).
3. Randomly shift `k` instances in order to create different batches
for different epochs. Create minibatches.
4. Shuffle the minibatches.
:param manifest: Manifest contents. List of dict.
:type manifest: list
:param batch_size: Batch size. This size is also used to generate
a random number for batch shuffle.
:type batch_size: int
:param clipped: Whether to clip the heading (small shift) and trailing
(incomplete batch) instances.
:type clipped: bool
:return: Batch shuffled manifest.
:rtype: list
"""
manifest.sort(key=lambda x: x["duration"])
shift_len = self._rng.randint(0, batch_size - 1)
batch_manifest = zip(*[iter(manifest[shift_len:])] * batch_size)
self._rng.shuffle(batch_manifest)
batch_manifest = list(sum(batch_manifest, ()))
if not clipped:
res_len = len(manifest) - shift_len - len(batch_manifest)
# avoid manifest[-res_len:] here: when res_len is 0 it would append everything
batch_manifest.extend(manifest[len(manifest) - res_len:])
batch_manifest.extend(manifest[0:shift_len])
return batch_manifest
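# A self-contained illustration of the batch-shuffle scheme documented in
# _batch_shuffle above, using plain integers in place of manifest entries; the
# list size, batch size and seed are arbitrary.
import random

rng = random.Random(0)
sorted_manifest = list(range(10))            # stands in for duration-sorted instances
batch_size = 3
shift_len = rng.randint(0, batch_size - 1)   # step 2: random head shift
batches = list(zip(*[iter(sorted_manifest[shift_len:])] * batch_size))  # step 3
rng.shuffle(batches)                         # step 4: batch-wise shuffle
shuffled = list(sum(batches, ()))
res_len = len(sorted_manifest) - shift_len - len(shuffled)
if res_len > 0:                              # re-attach the clipped tail and head
    shuffled.extend(sorted_manifest[-res_len:])
shuffled.extend(sorted_manifest[0:shift_len])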
"""Contains the audio featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from data_utils import utils
from data_utils.audio import AudioSegment
class AudioFeaturizer(object):
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
Currently, it only supports feature type of linear spectrogram.
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Audio is resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel level before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
self._stride_ms = stride_ms
self._window_ms = window_ms
self._max_freq = max_freq
self._target_sample_rate = target_sample_rate
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB
def featurize(self,
audio_segment,
allow_downsampling=True,
allow_upsampling=True):
"""Extract audio features from AudioSegment or SpeechSegment.
:param audio_segment: Audio/speech segment to extract features from.
:type audio_segment: AudioSegment|SpeechSegment
:param allow_downsampling: Whether to allow audio downsampling before
featurizing.
:type allow_downsampling: bool
:param allow_upsampling: Whether to allow audio upsampling before
featurizing.
:type allow_upsampling: bool
:return: Spectrogram audio feature in 2darray.
:rtype: ndarray
:raises ValueError: If audio sample rate is not supported.
"""
# upsampling or downsampling
if ((audio_segment.sample_rate > self._target_sample_rate and
allow_downsampling) or
(audio_segment.sample_rate < self._target_sample_rate and
allow_upsampling)):
audio_segment.resample(self._target_sample_rate)
if audio_segment.sample_rate != self._target_sample_rate:
raise ValueError("Audio sample rate is not supported. "
"Turn allow_downsampling or allow up_sampling on.")
# decibel normalization
if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)
def _compute_specgram(self, samples, sample_rate):
"""Extract various audio features."""
if self._specgram_type == 'linear':
return self._compute_linear_specgram(
samples, sample_rate, self._stride_ms, self._window_ms,
self._max_freq)
else:
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy."""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.log(specgram[:ind, :] + eps)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
truncate_size = (len(samples) - window_size) % stride_size
samples = samples[:len(samples) - truncate_size]
nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
nstrides = (samples.strides[0], samples.strides[0] * stride_size)
windows = np.lib.stride_tricks.as_strided(
samples, shape=nshape, strides=nstrides)
assert np.all(
windows[:, 1] == samples[stride_size:(stride_size + window_size)])
# window weighting, squared Fast Fourier Transform (fft), scaling
weighting = np.hanning(window_size)[:, None]
fft = np.fft.rfft(windows * weighting, axis=0)
fft = np.absolute(fft)**2
scale = np.sum(weighting**2) * sample_rate
fft[1:-1, :] *= (2.0 / scale)
fft[(0, -1), :] /= scale
# prepare fft frequency list
freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
return fft, freqs
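# Worked sizes for the linear spectrogram above, under the defaults assumed here
# (16 kHz audio, 20 ms window, 10 ms stride): the window covers 320 samples, the
# stride 160 samples, and np.fft.rfft over a 320-sample window yields
# 320 // 2 + 1 = 161 frequency bins -- the "161" placeholder that the network
# configuration later in this document refers to.
sample_rate = 16000
window_ms, stride_ms = 20.0, 10.0
window_size = int(0.001 * sample_rate * window_ms)          # 320 samples
stride_size = int(0.001 * sample_rate * stride_ms)          # 160 samples
num_fft_bins = window_size // 2 + 1                         # 161 bins
n_samples = sample_rate                                     # one second of audio
num_frames = (n_samples - window_size) // stride_size + 1   # 99 frames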
"""Contains the speech featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer(object):
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
Currently, for audio parts, it only supports feature type of linear
spectrogram; for transcript parts, it only supports char-level tokenizing
and conversion into a list of token indices. Note that the token indexing
order follows the given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type vocab_filepath: basestring
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float
:param max_freq: Used when specgram_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Speech is resampled (if upsampling or
downsampling is allowed) to this before
extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel level before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""
def __init__(self,
vocab_filepath,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB)
self._text_featurizer = TextFeaturizer(vocab_filepath)
def featurize(self, speech_segment):
"""Extract features for speech segment.
1. For audio parts, extract the audio features.
2. For transcript parts, convert text string to a list of token indices
in char-level.
:param speech_segment: Speech segment to extract features from.
:type speech_segment: SpeechSegment
:return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of
char-level token indices.
:rtype: tuple
"""
audio_feature = self._audio_featurizer.featurize(speech_segment)
text_ids = self._text_featurizer.featurize(speech_segment.transcript)
return audio_feature, text_ids
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return self._text_featurizer.vocab_size
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._text_featurizer.vocab_list
"""Contains the text featurizer class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
class TextFeaturizer(object):
"""Text featurizer, for processing or extracting features from text.
Currently, it only supports char-level tokenizing and conversion into
a list of token indices. Note that the token indexing order follows the
given vocabulary file.
:param vocab_filepath: Filepath to load vocabulary for token indices
conversion.
:type vocab_filepath: basestring
"""
def __init__(self, vocab_filepath):
self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
vocab_filepath)
def featurize(self, text):
"""Convert text string to a list of token indices in char-level.Note
that the token indexing order follows the given vocabulary file.
:param text: Text to process.
:type text: basestring
:return: List of char-level token indices.
:rtype: list
"""
tokens = self._char_tokenize(text)
return [self._vocab_dict[token] for token in tokens]
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return len(self._vocab_list)
@property
def vocab_list(self):
"""Return the vocabulary in list.
:return: Vocabulary in list.
:rtype: list
"""
return self._vocab_list
def _char_tokenize(self, text):
"""Character tokenizer."""
return list(text.strip())
def _load_vocabulary_from_file(self, vocab_filepath):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
vocab_dict = dict(
[(token, id) for (id, token) in enumerate(vocab_list)])
return vocab_dict, vocab_list
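# A minimal usage sketch of TextFeaturizer with a hypothetical vocabulary file
# (one token per line, such as the character list included later in this
# document); the path below matches the default used by infer.py and train.py.
from data_utils.featurizer.text_featurizer import TextFeaturizer

text_featurizer = TextFeaturizer('datasets/vocab/eng_vocab.txt')
token_ids = text_featurizer.featurize("hello")   # five char-level token indices
vocab_size = text_featurizer.vocab_size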
"""Contains feature normalizers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import data_utils.utils as utils
from data_utils.audio import AudioSegment
class FeatureNormalizer(object):
"""Feature normalizer. Normalize features to be of zero mean and unit
stddev.
If mean_std_filepath is provided (not None), the normalizer will directly
initialize from the file. Otherwise, both manifest_path and featurize_func
should be given for on-the-fly mean and stddev computing.
:param mean_std_filepath: File containing the pre-computed mean and stddev.
:type mean_std_filepath: None|basestring
:param manifest_path: Manifest of instances for computing mean and stddev.
:type manifest_path: None|basestring
:param featurize_func: Function to extract features. It should be callable
with ``featurize_func(audio_segment)``.
:type featurize_func: None|callable
:param num_samples: Number of random samples for computing mean and stddev.
:type num_samples: int
:param random_seed: Random seed for sampling instances.
:type random_seed: int
:raises ValueError: If both mean_std_filepath and manifest_path
(or both mean_std_filepath and featurize_func) are None.
"""
def __init__(self,
mean_std_filepath,
manifest_path=None,
featurize_func=None,
num_samples=500,
random_seed=0):
if not mean_std_filepath:
if not (manifest_path and featurize_func):
raise ValueError("If mean_std_filepath is None, meanifest_path "
"and featurize_func should not be None.")
self._rng = random.Random(random_seed)
self._compute_mean_std(manifest_path, featurize_func, num_samples)
else:
self._read_mean_std_from_file(mean_std_filepath)
def apply(self, features, eps=1e-14):
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:type features: ndarray
:param eps: Added to stddev to provide numerical stability.
:type eps: float
:return: Normalized features.
:rtype: ndarray
"""
return (features - self._mean) / (self._std + eps)
def write_to_file(self, filepath):
"""Write the mean and stddev to the file.
:param filepath: File to write mean and stddev.
:type filepath: basestring
"""
np.savez(filepath, mean=self._mean, std=self._std)
def _read_mean_std_from_file(self, filepath):
"""Load mean and std from file."""
npzfile = np.load(filepath)
self._mean = npzfile["mean"]
self._std = npzfile["std"]
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
"""Compute mean and std from randomly sampled instances."""
manifest = utils.read_manifest(manifest_path)
sampled_manifest = self._rng.sample(manifest, num_samples)
features = []
for instance in sampled_manifest:
features.append(
featurize_func(
AudioSegment.from_file(instance["audio_filepath"])))
features = np.hstack(features)
self._mean = np.mean(features, axis=1).reshape([-1, 1])
self._std = np.std(features, axis=1).reshape([-1, 1])
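# A sketch of the two ways FeatureNormalizer can be constructed, following the
# class docstring above: load a precomputed mean_std.npz, or compute the
# statistics on the fly from a manifest plus a featurize function. The file
# paths are placeholders taken from the argparse defaults elsewhere in this
# document.
from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from data_utils.normalizer import FeatureNormalizer

# 1) load precomputed statistics
normalizer = FeatureNormalizer('mean_std.npz')

# 2) compute statistics from a manifest and persist them for later runs
audio_featurizer = AudioFeaturizer()
normalizer = FeatureNormalizer(
    mean_std_filepath=None,
    manifest_path='datasets/manifest.train',
    featurize_func=audio_featurizer.featurize,
    num_samples=500)
normalizer.write_to_file('mean_std.npz')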
"""Contains the speech segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from data_utils.audio import AudioSegment
class SpeechSegment(AudioSegment):
"""Speech segment abstraction, a subclass of AudioSegment,
with an additional transcript.
:param samples: Audio samples [num_samples x num_channels].
:type samples: ndarray.float32
:param sample_rate: Audio sample rate.
:type sample_rate: int
:param transcript: Transcript text for the speech.
:type transcript: basestring
:raises TypeError: If the sample data type is not float or int.
"""
def __init__(self, samples, sample_rate, transcript):
AudioSegment.__init__(self, samples, sample_rate)
self._transcript = transcript
def __eq__(self, other):
"""Return whether two objects are equal.
"""
if not AudioSegment.__eq__(self, other):
return False
if self._transcript != other._transcript:
return False
return True
def __ne__(self, other):
"""Return whether two objects are unequal."""
return not self.__eq__(other)
@classmethod
def from_file(cls, filepath, transcript):
"""Create speech segment from audio file and corresponding transcript.
:param filepath: Filepath or file object to audio file.
:type filepath: basestring|file
:param transcript: Transcript text for the speech.
:type transcript: basestring
:return: Speech segment instance.
:rtype: SpeechSegment
"""
audio = AudioSegment.from_file(filepath)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def from_bytes(cls, bytes, transcript):
"""Create speech segment from a byte string and corresponding
transcript.
:param bytes: Byte string containing audio samples.
:type bytes: str
:param transcript: Transcript text for the speech.
:type transcript: basestring
:return: Speech segment instance.
:rtype: SpeechSegment
"""
audio = AudioSegment.from_bytes(bytes)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of speech segments together, both
audio and transcript will be concatenated.
:param *segments: Input speech segments to be concatenated.
:type *segments: tuple of SpeechSegment
:return: Speech segment instance.
:rtype: SpeechSegment
:raises ValueError: If the number of segments is zero, or if the
sample_rate of any two segments does not match.
:raises TypeError: If any segment is not SpeechSegment instance.
"""
if len(segments) == 0:
raise ValueError("No speech segments are given to concatenate.")
sample_rate = segments[0]._sample_rate
transcripts = ""
for seg in segments:
if sample_rate != seg._sample_rate:
raise ValueError("Can't concatenate segments with "
"different sample rates")
if type(seg) is not cls:
raise TypeError("Only speech segments of the same type "
"instance can be concatenated.")
transcripts += seg._transcript
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate, transcripts)
@classmethod
def slice_from_file(cls, filepath, transcript, start=None, end=None):
"""Loads a small section of an speech without having to load
the entire file into the memory which can be incredibly wasteful.
:param filepath: Filepath or file object to audio file.
:type filepath: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:param transcript: Transcript text for the speech. If not provided,
the default is an empty string.
:type transcript: basestring
:return: SpeechSegment instance of the specified slice of the input
speech file.
:rtype: SpeechSegment
"""
audio = AudioSegment.slice_from_file(filepath, start, end)
return cls(audio.samples, audio.sample_rate, transcript)
@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent speech segment of the given duration and
sample rate, transcript will be an empty string.
:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silence of the given duration.
:rtype: SpeechSegment
"""
audio = AudioSegment.make_silence(duration, sample_rate)
return cls(audio.samples, audio.sample_rate, "")
@property
def transcript(self):
"""Return the transcript text.
:return: Transcript text for the speech.
:rtype: basestring
"""
return self._transcript
"""Contains data helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
"""Load and parse manifest file.
Instances with durations outside [min_duration, max_duration] will be
filtered out.
:param manifest_path: Manifest file to load and parse.
:type manifest_path: basestring
:param max_duration: Maximal duration in seconds for instance filter.
:type max_duration: float
:param min_duration: Minimal duration in seconds for instance filter.
:type min_duration: float
:return: Manifest parsing results. List of dict.
:rtype: list
:raises IOError: If parsing the manifest fails.
"""
manifest = []
for json_line in open(manifest_path):
try:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
if (json_data["duration"] <= max_duration and
json_data["duration"] >= min_duration):
manifest.append(json_data)
return manifest
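# A hedged example of a single manifest line as produced by create_manifest in
# the LibriSpeech preparation script below: one JSON object per line holding the
# audio filepath, the duration in seconds and the transcript text. The path and
# values here are made up.
example_manifest_line = (
'{"audio_filepath": "/path/to/LibriSpeech/test-clean/speaker-chapter-0000.flac", '
'"duration": 3.52, "text": "some transcript in lower case"}')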
"""Prepare Librispeech ASR datasets.
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import os
import wget
import tarfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = "http://www.openslr.org/resources/12"
URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"
MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135"
MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931"
MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708"
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Libri",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
parser.add_argument(
"--full_download",
default="True",
type=distutils.util.strtobool,
help="Download all datasets for Librispeech."
" If False, only download a minimal requirement (test-clean, dev-clean"
" train-clean-100). (default: %(default)s)")
args = parser.parse_args()
def download(url, md5sum, target_dir):
"""
Download file from url to target_dir, and check md5sum.
"""
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, url.split("/")[-1])
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath
def unpack(filepath, target_dir):
"""
Unpack the file to the target_dir.
"""
print("Unpacking %s ..." % filepath)
tar = tarfile.open(filepath)
tar.extractall(target_dir)
tar.close()
def create_manifest(data_dir, manifest_path):
"""
Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in sorted(os.walk(data_dir)):
text_filelist = [
filename for filename in filelist if filename.endswith('trans.txt')
]
if len(text_filelist) > 0:
text_filepath = os.path.join(data_dir, subfolder, text_filelist[0])
for line in open(text_filepath):
segments = line.strip().split()
text = ' '.join(segments[1:]).lower()
audio_filepath = os.path.join(data_dir, subfolder,
segments[0] + '.flac')
audio_data, samplerate = soundfile.read(audio_filepath)
duration = float(len(audio_data)) / samplerate
json_lines.append(
json.dumps({
'audio_filepath': audio_filepath,
'duration': duration,
'text': text
}))
with open(manifest_path, 'w') as out_file:
for line in json_lines:
out_file.write(line + '\n')
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""
Download, unpack and create summary manifest file.
"""
if not os.path.exists(os.path.join(target_dir, "LibriSpeech")):
# download
filepath = download(url, md5sum, target_dir)
# unpack
unpack(filepath, target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path)
def main():
prepare_dataset(
url=URL_TEST_CLEAN,
md5sum=MD5_TEST_CLEAN,
target_dir=os.path.join(args.target_dir, "test-clean"),
manifest_path=args.manifest_prefix + ".test-clean")
prepare_dataset(
url=URL_DEV_CLEAN,
md5sum=MD5_DEV_CLEAN,
target_dir=os.path.join(args.target_dir, "dev-clean"),
manifest_path=args.manifest_prefix + ".dev-clean")
prepare_dataset(
url=URL_TRAIN_CLEAN_100,
md5sum=MD5_TRAIN_CLEAN_100,
target_dir=os.path.join(args.target_dir, "train-clean-100"),
manifest_path=args.manifest_prefix + ".train-clean-100")
if args.full_download:
prepare_dataset(
url=URL_TEST_OTHER,
md5sum=MD5_TEST_OTHER,
target_dir=os.path.join(args.target_dir, "test-other"),
manifest_path=args.manifest_prefix + ".test-other")
prepare_dataset(
url=URL_DEV_OTHER,
md5sum=MD5_DEV_OTHER,
target_dir=os.path.join(args.target_dir, "dev-other"),
manifest_path=args.manifest_prefix + ".dev-other")
prepare_dataset(
url=URL_TRAIN_CLEAN_360,
md5sum=MD5_TRAIN_CLEAN_360,
target_dir=os.path.join(args.target_dir, "train-clean-360"),
manifest_path=args.manifest_prefix + ".train-clean-360")
prepare_dataset(
url=URL_TRAIN_OTHER_500,
md5sum=MD5_TRAIN_OTHER_500,
target_dir=os.path.join(args.target_dir, "train-other-500"),
manifest_path=args.manifest_prefix + ".train-other-500")
if __name__ == '__main__':
main()
cd librispeech
python librispeech.py
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
cd -
cat librispeech/manifest.train* | shuf > manifest.train
cat librispeech/manifest.dev-clean > manifest.dev
cat librispeech/manifest.test-clean > manifest.test
echo "All done."
'
a
b
c
d
e
f
g
h
i
j
k
l
m
n
o
p
q
r
s
t
u
v
w
x
y
z
"""Contains various CTC decoder."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from itertools import groupby
def ctc_best_path_decode(probs_seq, vocabulary):
"""Best path decoding, also called argmax decoding or greedy decoding.
Path consisting of the most probable tokens are further post-processed to
remove consecutive repetitions and all blanks.
:param probs_seq: 2-D list of probabilities over the vocabulary for each
character. Each element is a list of float probabilities
for one character.
:type probs_seq: list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:return: Decoding result string.
:rtype: basestring
"""
# dimension verification
for probs in probs_seq:
if not len(probs) == len(vocabulary) + 1:
raise ValueError("probs_seq dimension mismatchedd with vocabulary")
# argmax to get the best index for each time step
max_index_list = list(np.array(probs_seq).argmax(axis=1))
# remove consecutive duplicate indexes
index_list = [index_group[0] for index_group in groupby(max_index_list)]
# remove blank indexes
blank_index = len(vocabulary)
index_list = [index for index in index_list if index != blank_index]
# convert index list to string
return ''.join([vocabulary[index] for index in index_list])
def ctc_decode(probs_seq, vocabulary, method):
"""CTC-like sequence decoding from a sequence of likelihood probablilites.
:param probs_seq: 2-D list of probabilities over the vocabulary for each
character. Each element is a list of float probabilities
for one character.
:type probs_seq: list
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param method: Decoding method name, with options: "best_path".
:type method: basestring
:return: Decoding result string.
:rtype: basestring
"""
for prob_list in probs_seq:
if not len(prob_list) == len(vocabulary) + 1:
raise ValueError("probs dimension mismatchedd with vocabulary")
if method == "best_path":
return ctc_best_path_decode(probs_seq, vocabulary)
else:
raise ValueError("Decoding method [%s] is not supported.")
# -*- coding: utf-8 -*-
"""This module provides functions to calculate error rate in different level.
e.g. wer for word-level, cer for char-level.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def _levenshtein_distance(ref, hyp):
"""Levenshtein distance is a string metric for measuring the difference between
two sequences. Informally, the levenshtein disctance is defined as the minimum
number of single-character edits (substitutions, insertions or deletions)
required to change one word into the other. We can naturally extend the edits to
word level when calculate levenshtein disctance for two sentences.
"""
ref_len = len(ref)
hyp_len = len(hyp)
# special case
if ref == hyp:
return 0
if ref_len == 0:
return hyp_len
if hyp_len == 0:
return ref_len
distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
# initialize distance matrix
for j in xrange(hyp_len + 1):
distance[0][j] = j
for i in xrange(ref_len + 1):
distance[i][0] = i
# calculate levenshtein distance
for i in xrange(1, ref_len + 1):
for j in xrange(1, hyp_len + 1):
if ref[i - 1] == hyp[j - 1]:
distance[i][j] = distance[i - 1][j - 1]
else:
s_num = distance[i - 1][j - 1] + 1
i_num = distance[i][j - 1] + 1
d_num = distance[i - 1][j] + 1
distance[i][j] = min(s_num, i_num, d_num)
return distance[ref_len][hyp_len]
def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
"""Calculate word error rate (WER). WER compares reference text and
hypothesis text in word-level. WER is defined as:
.. math::
WER = (Sw + Dw + Iw) / Nw
where
.. code-block:: text
Sw is the number of words substituted,
Dw is the number of words deleted,
Iw is the number of words inserted,
Nw is the number of words in the reference
We can use Levenshtein distance to calculate WER. Please note that
empty items will be removed when splitting sentences by the delimiter.
:param reference: The reference sentence.
:type reference: basestring
:param hypothesis: The hypothesis sentence.
:type hypothesis: basestring
:param ignore_case: Whether to ignore case when comparing.
:type ignore_case: bool
:param delimiter: Delimiter of input sentences.
:type delimiter: char
:return: Word error rate.
:rtype: float
:raises ValueError: If the reference length is zero.
"""
if ignore_case:
reference = reference.lower()
hypothesis = hypothesis.lower()
ref_words = filter(None, reference.split(delimiter))
hyp_words = filter(None, hypothesis.split(delimiter))
if len(ref_words) == 0:
raise ValueError("Reference's word number should be greater than 0.")
edit_distance = _levenshtein_distance(ref_words, hyp_words)
wer = float(edit_distance) / len(ref_words)
return wer
def cer(reference, hypothesis, ignore_case=False):
"""Calculate charactor error rate (CER). CER compares reference text and
hypothesis text in char-level. CER is defined as:
.. math::
CER = (Sc + Dc + Ic) / Nc
where
.. code-block:: text
Sc is the number of characters substituted,
Dc is the number of characters deleted,
Ic is the number of characters inserted
Nc is the number of characters in the reference
We can use Levenshtein distance to calculate CER. Chinese input should be
encoded to unicode. Please note that leading and trailing
whitespace characters will be truncated, and multiple consecutive whitespace
characters in a sentence will be replaced by a single whitespace character.
:param reference: The reference sentence.
:type reference: basestring
:param hypothesis: The hypothesis sentence.
:type hypothesis: basestring
:param ignore_case: Whether to ignore case when comparing.
:type ignore_case: bool
:return: Character error rate.
:rtype: float
:raises ValueError: If the reference length is zero.
"""
if ignore_case:
reference = reference.lower()
hypothesis = hypothesis.lower()
reference = ' '.join(filter(None, reference.split(' ')))
hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
if len(reference) == 0:
raise ValueError("Length of reference should be greater than 0.")
edit_distance = _levenshtein_distance(reference, hypothesis)
cer = float(edit_distance) / len(reference)
return cer
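# A short worked example of the edit-distance based rates defined above: the two
# word sequences differ by one substitution out of four reference words, so
# WER = 1 / 4; at character level the distance between "kitten" and "sitting"
# is the classic 3 edits over 6 reference characters, so CER = 0.5.
assert wer('the cat sat down', 'the cat sat up') == 0.25
assert abs(cer('kitten', 'sitting') - 0.5) < 1e-6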
"""Inferer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import gzip
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from data_utils.data import DataGenerator
from model import deep_speech2
from decoder import ctc_decode
import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--num_samples",
default=10,
type=int,
help="Number of samples for inference. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
args = parser.parse_args()
def infer():
"""Max-ctc-decoding for DeepSpeech2."""
# initialize data generator
data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}',
num_threads=args.num_threads_data)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be inferred during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
output_probs = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=True)
# load parameters
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.model_filepath))
# prepare infer data
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path,
batch_size=args.num_samples,
sortagrad=False,
shuffle_method=None)
infer_data = batch_reader().next()
# run inference
infer_results = paddle.infer(
output_layer=output_probs, parameters=parameters, input=infer_data)
num_steps = len(infer_results) // len(infer_data)
probs_split = [
infer_results[i * num_steps:(i + 1) * num_steps]
for i in xrange(len(infer_data))
]
# decode and print
for i, probs in enumerate(probs_split):
output_transcription = ctc_decode(
probs_seq=probs,
vocabulary=data_generator.vocab_list,
method="best_path")
target_transcription = ''.join(
[data_generator.vocab_list[index] for index in infer_data[i][1]])
print("Target Transcription: %s \nOutput Transcription: %s \n" %
(target_transcription, output_transcription))
def main():
utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=1)
infer()
if __name__ == '__main__':
main()
"""Contains DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act):
"""
Convolution layer with batch normalization.
"""
conv_layer = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=num_channels_in,
num_filters=num_channels_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=conv_layer, act=act)
def bidirectional_simple_rnn_bn_layer(name, input, size, act):
"""
Bidirectional simple RNN layer with sequence-wise batch normalization.
The batch normalization is only performed on input-state weights.
"""
# input-hidden weights shared across the bi-directional RNN.
input_proj = paddle.layer.fc(
input=input, size=size, act=paddle.activation.Linear(), bias_attr=False)
# batch norm is only performed on input-state projection
input_proj_bn = paddle.layer.batch_norm(
input=input_proj, act=paddle.activation.Linear())
# forward and backward in time
forward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=False)
backward_simple_rnn = paddle.layer.recurrent(
input=input_proj_bn, act=act, reverse=True)
return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])
def conv_group(input, num_stacks):
"""
Convolution group with several stacking convolution layers.
"""
conv = conv_bn_layer(
input=input,
filter_size=(11, 41),
num_channels_in=1,
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu())
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
filter_size=(11, 21),
num_channels_in=32,
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu())
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
def rnn_group(input, size, num_stacks):
"""
RNN group with several stacking RNN layers.
"""
output = input
for i in xrange(num_stacks):
output = bidirectional_simple_rnn_bn_layer(
name=str(i), input=output, size=size, act=paddle.activation.BRelu())
return output
def deep_speech2(audio_data,
text_data,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=256,
is_inference=False):
"""
The whole DeepSpeech2 model structure (a simplified version).
:param audio_data: Audio spectrogram data layer.
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (number of RNN cells).
:type rnn_size: int
:param is_inference: False in the training mode, and True in the
inference mode.
:type is_inference: bool
:return: If is_inference is False, return a CTC cost layer;
if is_inference is True, return a sequence layer of output
probability distributions.
:rtype: LayerOutput
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data, num_stacks=num_conv_layers)
# convert data from the convolution feature map to a sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
num_channels=conv_group_num_channels,
stride_x=1,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# rnn group
rnn_group_output = rnn_group(
input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers)
fc = paddle.layer.fc(
input=rnn_group_output,
size=dict_size + 1,
act=paddle.activation.Linear(),
bias_attr=True)
if is_inference:
# probability distribution with softmax
return paddle.layer.mixed(
input=paddle.layer.identity_projection(input=fc),
act=paddle.activation.Softmax())
else:
# ctc cost
return paddle.layer.warp_ctc(
input=fc,
label=text_data,
size=dict_size + 1,
blank=dict_size,
norm_by_times=True)
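# A hedged sanity check of the feature-map arithmetic in conv_group above: with
# the default num_conv_layers=2, output_height = 160 // pow(2, 2) + 1 = 41, so
# after block_expand every time step fed into the RNN group is a
# 32 * 41 = 1312-dimensional vector. The 161-bin spectrogram height assumed here
# matches the AudioFeaturizer defaults earlier in this document.
num_conv_layers = 2
output_num_channels = 32
output_height = 160 // pow(2, num_conv_layers) + 1    # 41
rnn_input_dim = output_num_channels * output_height   # 1312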
wget==3.2
scipy==0.13.1
resampy==0.1.5
#!/bin/bash
# install python dependencies
if [ -f "requirements.txt" ]; then
pip install -r requirements.txt
fi
if [ $? != 0 ]; then
echo "Install python dependencies failed !!!"
exit 1
fi
# install package Soundfile
curl -O "http://www.mega-nerd.com/libsndfile/files/libsndfile-1.0.28.tar.gz"
if [ $? != 0 ]; then
echo "Download libsndfile-1.0.28.tar.gz failed !!!"
exit 1
fi
tar -zxvf libsndfile-1.0.28.tar.gz
cd libsndfile-1.0.28
./configure && make && make install
cd -
rm -rf libsndfile-1.0.28
rm libsndfile-1.0.28.tar.gz
pip install SoundFile==0.9.0.post1
if [ $? != 0 ]; then
echo "Install SoundFile failed !!!"
exit 1
fi
# prepare ./checkpoints
mkdir checkpoints
echo "Install all dependencies successfully."
# -*- coding: utf-8 -*-
"""Test error rate."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import error_rate
class TestParse(unittest.TestCase):
def test_wer_1(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6)
def test_wer_2(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
word_error_rate = error_rate.wer(ref, ref)
self.assertEqual(word_error_rate, 0.0)
def test_wer_3(self):
ref = ' '
hyp = 'Hypothesis sentence'
with self.assertRaises(ValueError):
word_error_rate = error_rate.wer(ref, hyp)
def test_cer_1(self):
ref = 'werewolf'
hyp = 'weae wolf'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
def test_cer_2(self):
ref = 'werewolf'
char_error_rate = error_rate.cer(ref, ref)
self.assertEqual(char_error_rate, 0.0)
def test_cer_3(self):
ref = u'我是中国人'
hyp = u'我是 美洲人'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)
def test_cer_4(self):
ref = u'我是中国人'
char_error_rate = error_rate.cer(ref, ref)
self.assertEqual(char_error_rate, 0.0)
def test_cer_5(self):
ref = ''
hyp = 'Hypothesis'
with self.assertRaises(ValueError):
char_error_rate = error_rate.cer(ref, hyp)
if __name__ == '__main__':
unittest.main()
"""Trainer for DeepSpeech2 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import argparse
import gzip
import time
import distutils.util
import multiprocessing
import paddle.v2 as paddle
from model import deep_speech2
from data_utils.data import DataGenerator
import utils
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--batch_size", default=256, type=int, help="Minibatch size.")
parser.add_argument(
"--num_passes",
default=200,
type=int,
help="Training pass number. (default: %(default)s)")
parser.add_argument(
"--num_conv_layers",
default=2,
type=int,
help="Convolution layer number. (default: %(default)s)")
parser.add_argument(
"--num_rnn_layers",
default=3,
type=int,
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=512,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--adam_learning_rate",
default=5e-4,
type=float,
help="Learning rate for ADAM Optimizer. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--use_sortagrad",
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--train_manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for training. (default: %(default)s)")
parser.add_argument(
"--dev_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for validation. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--init_model_path",
default=None,
type=str,
help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='[{"type": "shift", '
'"params": {"min_shift_ms": -5, "max_shift_ms": 5},'
'"prob": 1.0}]',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
args = parser.parse_args()
def train():
"""DeepSpeech2 training."""
# initialize data generator
def data_generator():
return DataGenerator(
vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config,
max_duration=args.max_duration,
min_duration=args.min_duration,
num_threads=args.num_threads_data)
train_generator = data_generator()
test_generator = data_generator()
# create network config
# paddle.data_type.dense_array is used for variable batch input.
# The size 161 * 161 is only a placeholder value and the real shape
# of input batch data will be inferred during training.
audio_data = paddle.layer.data(
name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(
train_generator.vocab_size))
cost = deep_speech2(
audio_data=audio_data,
text_data=text_data,
dict_size=train_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
rnn_size=args.rnn_layer_size,
is_inference=False)
# create/load parameters and optimizer
if args.init_model_path is None:
parameters = paddle.parameters.create(cost)
else:
if not os.path.isfile(args.init_model_path):
raise IOError("Invalid model!")
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(args.init_model_path))
optimizer = paddle.optimizer.Adam(
learning_rate=args.adam_learning_rate, gradient_clipping_threshold=400)
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# prepare data reader
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path,
batch_size=args.batch_size,
min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method)
test_batch_reader = test_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path,
batch_size=args.batch_size,
        min_batch_size=1,  # must be 1, though errors may still occur.
sortagrad=False,
shuffle_method=None)
# create event handler
def event_handler(event):
global start_time, cost_sum, cost_counter
if isinstance(event, paddle.event.EndIteration):
cost_sum += event.cost
cost_counter += 1
if (event.batch_id + 1) % 100 == 0:
print("\nPass: %d, Batch: %d, TrainCost: %f" % (
event.pass_id, event.batch_id + 1, cost_sum / cost_counter))
cost_sum, cost_counter = 0.0, 0
with gzip.open("checkpoints/params.latest.tar.gz", 'w') as f:
parameters.to_tar(f)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.BeginPass):
start_time = time.time()
cost_sum, cost_counter = 0.0, 0
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=test_batch_reader, feeding=test_generator.feeding)
print("\n------- Time: %d sec, Pass: %d, ValidationCost: %s" %
(time.time() - start_time, event.pass_id, result.cost))
with gzip.open("checkpoints/params.pass-%d.tar.gz" % event.pass_id,
'w') as f:
parameters.to_tar(f)
# run train
trainer.train(
reader=train_batch_reader,
event_handler=event_handler,
num_passes=args.num_passes,
feeding=train_generator.feeding)
def main():
utils.print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train()
if __name__ == '__main__':
main()
"""Contains common utility functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----- Configuration Arguments -----")
for arg, value in vars(args).iteritems():
print("%s: %s" % (arg, value))
print("------------------------------------")
# Generating Text with a Recurrent Neural Network Language Model
A language model is a probabilistic model: simply put, it computes the probability of a sentence. With it we can decide which word sequence is more likely, or, given a few words, predict the most likely next word. Language models are a fundamental building block in natural language processing.
## Application Scenarios
**Language models are used in many areas**, for example:
* **Automatic writing**: a language model can generate the next word from the preceding text; applied recursively, it can produce whole sentences, paragraphs, and articles.
* **QA**: a language model can generate an answer from a given question.
* **Machine translation**: most mainstream machine translation models follow the Encoder-Decoder paradigm, in which the decoder is essentially a conditional language model that generates the target-language text.
* **Spell checking**: a language model scores a word sequence; the probability usually drops sharply at a misspelling, so it can be used to detect spelling errors and propose correction candidates.
* **Part-of-speech tagging, syntactic parsing, speech recognition, ...**
## About This Example
This example implements an RNN-based language model and uses it to generate text. The directory layout is as follows:
```text
.
├── data
│   └── train_data_examples.txt        # sample data; provide your own data in the same format
├── config.py                # configuration file covering data, training, and inference settings
├── generate.py              # inference script, i.e. text generation
├── beam_search.py           # implementation of the beam search algorithm
├── network_conf.py          # all network structures used in this example; modify this file to change the model
├── reader.py                # data reading interface
├── README.md
├── train.py                 # training script
└── utils.py                 # common helper functions, e.g. building and loading the dictionary
```
## RNN Language Model
### Introduction
An RNN is a sequence model. The basic idea is: at time step $t$, the hidden-layer output of step $t-1$ and the word vector of step $t$ are fed into the hidden layer together to obtain the feature representation of step $t$, which is then used to produce the prediction at step $t$; this recursion continues along the time dimension. RNNs are therefore good at exploiting context and history, i.e. they have a kind of "memory". In theory an RNN can capture long-range dependencies (using knowledge from far back in the sequence), but in practice this works poorly, which motivated variants such as LSTM and GRU that add gating mechanisms to the recurrent memory cell and ease the difficulty of learning long sequences. The model in this example uses an LSTM or a GRU, selectable through the configuration. The figure below illustrates the "recurrence" idea of an RNN (in the broad sense, including LSTM, GRU, etc.) language model:
<p align=center><img src='images/rnn.png' width='500px'/></p>
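As background (this is the standard formulation of a language model, not code from this example), an RNN language model factorizes the probability of a sentence $w_1, \ldots, w_T$ into per-step conditional probabilities, each of which the RNN predicts from its hidden state:

$$P(w_1, \ldots, w_T) = \prod_{t=1}^{T} P(w_t \mid w_1, \ldots, w_{t-1})$$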
### Model Implementation
The RNN language model in this example is implemented as follows:
- **Define model parameters**: `config.py` defines the model's configuration variables.
- **Define the model structure**: the `rnn_lm` **function** in `network_conf.py` defines the network **structure** (see the illustrative sketch after this list):
    - Input layer: map the input word (or character) sequence to vectors, i.e. the word embedding layer `embedding`.
    - Hidden layers: build the RNN layers according to the configuration, taking the `embedding` vector sequence from the previous step as input.
    - Output layer: use `softmax` to normalize and compute the probability of each word.
    - Loss: multi-class cross-entropy is used as the model's loss function.
- **Train the model**: the `main` function in `train.py` implements training, with the following steps:
    - Prepare input data: build and save the dictionary, and construct the readers for the train and test data.
    - Initialize the model: including the model structure and parameters.
    - Build the trainer: the demo uses the Adam optimizer.
    - Define the callback: an `event_handler` tracks the training loss and saves the model parameters at the end of each pass.
    - Train: use the trainer to train the model.
- **Generate text**: `generate.py` implements text generation, with the following steps:
    - Load the trained model and the dictionary file.
    - Read `gen_file`, each line of which is a sentence prefix, and generate text from each prefix with [beam search](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md#柱搜索算法).
    - Save the generated text together with its prefix to the file `gen_result`.
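The project's `network_conf.py` is collapsed in this view, so the following is only a minimal sketch of how such a network could be assembled with the `paddle.v2` layer API; the helper name `rnn_lm_sketch` and the specific layer choices (e.g. `paddle.networks.simple_lstm`) are illustrative assumptions, not the repository's actual implementation:

```python
import paddle.v2 as paddle


def rnn_lm_sketch(vocab_size, emb_dim, hidden_size, stacked_rnn_num):
    # input word-id sequence and its one-step-shifted target sequence
    word = paddle.layer.data(
        name="word", type=paddle.data_type.integer_value_sequence(vocab_size))
    target = paddle.layer.data(
        name="target", type=paddle.data_type.integer_value_sequence(vocab_size))
    # word embedding layer
    rnn = paddle.layer.embedding(input=word, size=emb_dim)
    # stacked LSTM layers (a GRU configuration would use a GRU network here)
    for _ in range(stacked_rnn_num):
        rnn = paddle.networks.simple_lstm(input=rnn, size=hidden_size)
    # softmax output over the vocabulary
    prediction = paddle.layer.fc(
        input=rnn, size=vocab_size, act=paddle.activation.Softmax())
    # multi-class cross-entropy loss
    cost = paddle.layer.classification_cost(input=prediction, label=target)
    return cost, prediction
```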
## Usage
Run this example as follows:
* 1. Run `python train.py` to train the model (an RNN by default) and wait for training to finish.
* 2. Run `python generate.py` to generate text. (The input text defaults to `data/train_data_examples.txt`, and the generated text is saved to `data/gen_result.txt` by default.)
**To use your own corpus or customize the model, modify the configuration in `config.py`; the details and adaptation steps are as follows:**
### Preparing the Corpus
* Clean the corpus: remove spaces, tabs, and garbled characters from the raw text; remove digits, punctuation, and special symbols as needed.
* Format: one sentence per line; words within a line are separated by a single space.
* Configure the following parameters in `config.py` as needed:
```python
train_file = "data/train_data_examples.txt"
test_file = ""
vocab_file = "data/word_vocab.txt"
model_save_dir = "models"
```
1. `train_file`: path of the training data, which **must be tokenized in advance**.
2. `test_file`: path of the test data; if the test file is not empty, the model is evaluated on it at the end of every training `pass`.
3. `vocab_file`: path of the dictionary; if the file does not exist, word frequencies are counted over the training corpus and a dictionary is built automatically.
4. `model_save_dir`: directory in which models are saved; it is created automatically if it does not exist.
### Dictionary-Building Strategy
- When the specified dictionary file does not exist, word frequencies are counted over the training data and the dictionary is built automatically. The following two parameters in `config.py` control this process (an illustrative sketch follows at the end of this subsection):
```python
max_word_num = 51200 - 2
cutoff_word_fre = 0
```
1. `max_word_num`: the maximum number of words in the dictionary.
2. `cutoff_word_fre`: the minimum frequency a word must have in the training corpus to be included in the dictionary.
- For example, if `max_word_num = 5000` and `cutoff_word_fre = 10` but frequency counting finds that only 3,000 words occur more than 10 times in the training corpus, the final dictionary will contain just those 3,000 words.
- Two special tokens are always added when building the dictionary:
    1. `<unk>`: any word not present in the dictionary
    2. `<e>`: the end-of-sentence marker
*Note: a larger dictionary yields richer generated text but longer training time. After Chinese word segmentation, a corpus typically contains tens of thousands to hundreds of thousands of distinct words; if `max_word_num` is too small, the proportion of `<unk>` tokens becomes too high, while a very large value severely slows training (and also affects accuracy). An alternative is character-level training, i.e. treating every Chinese character as a word: there are only a few thousand common characters, so the dictionary stays small without losing much information, but the same character can have very different meanings in different words, which sometimes hurts model quality. Experiment and choose between word-level and character-level training based on your own data.*
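The helper that actually builds the dictionary lives in `utils.py`, which is collapsed in this view; the sketch below only illustrates the frequency-cutoff strategy described above, and the function name `build_vocab` and the one-word-per-line file format are assumptions:

```python
import collections


def build_vocab(train_file, vocab_file, max_word_num, cutoff_word_fre):
    # count word frequencies over the (pre-tokenized) training corpus
    counter = collections.Counter()
    with open(train_file) as f:
        for line in f:
            counter.update(line.strip().lower().split())
    # keep at most max_word_num words whose frequency reaches the cutoff,
    # most frequent first
    words = [w for w, c in counter.most_common(max_word_num)
             if c >= cutoff_word_fre]
    with open(vocab_file, "w") as f:
        # the two special tokens are always added
        for w in ["<unk>", "<e>"] + words:
            f.write("%s\n" % w)
```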
### Model Configuration and Training
* Adjust the following settings in `config.py` as needed to modify the structure of the RNN language model:
```python
rnn_type = "lstm" # "gru" or "lstm"
emb_dim = 256
hidden_size = 256
stacked_rnn_num = 2
```
1. `rnn_type`: selects the RNN cell, either "gru" or "lstm".
2. `emb_dim`: dimension of the word embeddings.
3. `hidden_size`: size of the hidden layer of each RNN cell.
4. `stacked_rnn_num`: number of stacked RNN layers, forming a deeper model.
* Run `python train.py` to train the model; models are saved to the directory specified by `model_save_dir`.
### Generating Text
* Adjust the following variables in `config.py` as needed; they are explained below:
```python
gen_file = "data/train_data_examples.txt"
gen_result = "data/gen_result.txt"
max_gen_len = 25 # the max number of words to generate
beam_size = 5
model_path = "models/rnn_lm_pass_00000.tar.gz"
```
1. `gen_file`: the input file; each line is a sentence prefix and **must be tokenized in advance**.
2. `gen_result`: the output file; generation results are written to it.
3. `max_gen_len`: the maximum length of each generated sentence; if the model fails to produce `<e>`, generation stops automatically after `max_gen_len` words.
4. `beam_size`: the expansion width at each step of beam search.
5. `model_path`: path of the trained model.
`gen_file` stores the text prefixes to be completed, one prefix per line, for example:
```text
若隐若现 地像 幽灵 , 像 死神
```
Save the prefixes to be completed to the file in this format;
* then run `python generate.py` to run beam search and generate text for each input prefix. Below is an example of the model's output:
```text
81 若隐若现 地像 幽灵 , 像 死神
-12.2542 一样 。 他 是 个 怪物 <e>
-12.6889 一样 。 他 是 个 英雄 <e>
-13.9877 一样 。 他 是 我 的 敌人 <e>
-14.2741 一样 。 他 是 我 的 <e>
-14.6250 一样 。 他 是 我 的 朋友 <e>
```
In this output:
1. The first line, `81 若隐若现 地像 幽灵 , 像 死神`, has two columns separated by `\t`:
    - The first column is the index of the input prefix in the training sample set.
    - The second column is the input prefix.
2. Lines 2 through `beam_size + 1` are the generation results, again split into two columns by `\t`:
    - The first column is the log probability of the generated sequence.
    - The second column is the generated text; a normal result ends with the token `<e>`, and if a result does not end with `<e>`, the maximum sequence length was reached and generation was cut off.
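The project's `beam_search.py` is collapsed in this view; below is only an illustrative sketch of beam search over a language model's step-wise next-word distribution. The callable `next_word_log_probs` stands in for the real inference wrapper and, like the function name `beam_search`, is an assumption rather than the repository's API:

```python
import heapq


def beam_search(prefix_ids, next_word_log_probs, eos_id, beam_size, max_gen_len):
    # each hypothesis is (cumulative log probability, generated ids, finished)
    beams = [(0.0, [], False)]
    for _ in range(max_gen_len):
        candidates = []
        for logp, words, done in beams:
            if done:  # finished hypotheses are carried over unchanged
                candidates.append((logp, words, True))
                continue
            # log-probabilities of every vocabulary word as the next word
            probs = next_word_log_probs(prefix_ids + words)
            # expand only the beam_size best continuations of this hypothesis
            for wid in heapq.nlargest(beam_size, range(len(probs)),
                                      key=probs.__getitem__):
                candidates.append(
                    (logp + probs[wid], words + [wid], wid == eos_id))
        # keep the beam_size best hypotheses overall
        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
        if all(done for _, _, done in beams):
            break
    return beams
```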
#!/usr/bin/env python
# coding=utf-8
import os
################## for building word dictionary ##################
max_word_num = 51200 - 2
cutoff_word_fre = 0
################## for training task #########################
# path of training data
train_file = "data/train_data_examples.txt"
# path of testing data, if testing file does not exist,
# testing will not be performed at the end of each training pass
test_file = ""
# path of word dictionary, if this file does not exist,
# word dictionary will be built from training data.
vocab_file = "data/word_vocab.txt"
# directory to save the trained model
# create a new directory if the directory does not exist
model_save_dir = "models"
batch_size = 32 # the number of training examples in one forward/backward pass
num_passes = 20 # how many passes to train the model
log_period = 50
save_period_by_batches = 50
use_gpu = True # to use gpu or not
trainer_count = 1  # number of trainers
################## for model configuration ##################
rnn_type = "lstm" # "gru" or "lstm"
emb_dim = 256
hidden_size = 256
stacked_rnn_num = 2
################## for text generation ##################
gen_file = "data/train_data_examples.txt"
gen_result = "data/gen_result.txt"
max_gen_len = 25 # the max number of words to generate
beam_size = 5
model_path = "models/rnn_lm_pass_00000.tar.gz"
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
我们 不会 伤害 你 的 。 他们 也 这么 说 。
你 拥有 你 父亲 皇室 的 血统 。 是 合法 的 继承人 。
叫 什么 你 可以 告诉 我 。
你 并 没有 留言 说 要 去 哪里 。 是 的 , 因为 我 必须 要 去 完成 这件 事 。
你 查出 是 谁 住 在 隔壁 房间 吗 ?
# coding=utf-8
import os
import sys
import gzip
import numpy as np
import paddle.v2 as paddle
from utils import logger, load_dict
from beam_search import BeamSearch
import config as conf
from network_conf import rnn_lm
def rnn_generate(gen_input_file, model_path, max_gen_len, beam_size,
word_dict_file):
"""
use RNN model to generate sequences.
:param word_id_dict: vocab.
:type word_id_dict: dictionary with content of "{word, id}",
"word" is string type , "id" is int type.
:param num_words: the number of the words to generate.
:type num_words: int
:param beam_size: beam width.
:type beam_size: int
:return: save prediction results to output_file
"""
assert os.path.exists(gen_input_file), "test file does not exist!"
assert os.path.exists(model_path), "trained model does not exist!"
assert os.path.exists(
word_dict_file), "word dictionary file does not exist!"
# load word dictionary
word_2_ids = load_dict(word_dict_file)
try:
UNK_ID = word_2_ids["<unk>"]
except KeyError:
logger.fatal("the word dictionary must contain a <unk> token!")
sys.exit(-1)
# initialize paddle
paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
# load the trained model
pred_words = rnn_lm(
len(word_2_ids),
conf.emb_dim,
conf.hidden_size,
conf.stacked_rnn_num,
conf.rnn_type,
is_infer=True)
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, "r"))
inferer = paddle.inference.Inference(
output_layer=pred_words, parameters=parameters)
generator = BeamSearch(inferer, word_dict_file, beam_size, max_gen_len)
# generate text
with open(conf.gen_file, "r") as fin, open(conf.gen_result, "w") as fout:
for idx, line in enumerate(fin):
fout.write("%d\t%s" % (idx, line))
for gen_res in generator.gen_a_sentence([
word_2_ids.get(w, UNK_ID)
for w in line.lower().strip().split()
]):
fout.write("%s\n" % gen_res)
fout.write("\n")
if __name__ == "__main__":
rnn_generate(conf.gen_file, conf.model_path, conf.max_gen_len,
conf.beam_size, conf.vocab_file)
# coding=utf-8
import collections
import os
MIN_LEN = 3
MAX_LEN = 100
def rnn_reader(file_name, word_dict):
"""
create reader for RNN, each line is a sample.
:param file_name: file name.
:param min_sentence_length: sentence's min length.
:param max_sentence_length: sentence's max length.
:param word_dict: vocab with content of '{word, id}',
'word' is string type , 'id' is int type.
:return: data reader.
"""
def reader():
UNK_ID = word_dict['<unk>']
with open(file_name) as file:
for line in file:
words = line.strip().lower().split()
if len(words) < MIN_LEN or len(words) > MAX_LEN:
continue
ids = [word_dict.get(w, UNK_ID)
for w in words] + [word_dict['<e>']]
yield ids[:-1], ids[1:]
return reader