+# SSD Object Detection
+## Overview
+SSD, short for Single Shot MultiBox Detector, is one of the newer and more accurate object detection algorithms; see the paper \[[1](#references)\] for details. Its main strengths are fast detection speed combined with high detection accuracy. SSD is already integrated into PaddlePaddle, and this example shows how to use the PaddlePaddle SSD model for object detection. The document proceeds as follows: a brief introduction to how SSD works, the files in this example and their roles, how to train, evaluate and run detection on the PASCAL VOC dataset, and finally a short guide to using SSD on your own dataset.
+## How SSD Works
+SSD performs "end-to-end" detection with a single convolutional neural network: the input is a raw image and the output is the detection result, with no external tools or pipeline stages for feature extraction, candidate box generation, and so on. The base model in the paper is VGG16 \[[2](#references)\]; compared with the original VGG16 network, SSD makes a few changes:
+
+1. The final fully connected layers fc6 and fc7 are turned into convolutional layers, whose parameters are obtained by subsampling the original fc6/fc7 parameters.
+2. The pool5 layer is changed from 2x2-s2 (2x2 kernel, stride 2) to 3x3-s1-p1 (3x3 kernel, stride 1, padding 1).
+3. Priorbox layers are attached after conv4\_3, conv7, conv8\_2, conv9\_2, conv10\_2 and pool11; a priorbox layer generates a set of rectangular candidate boxes from its input feature map (see the scale-rule sketch after this list). For a more detailed description of SSD, refer to the paper \[[1](#references)\].
+
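+The paper sizes the prior boxes across the m source feature maps with a linear scale rule, s\_k = s\_min + (s\_max - s\_min)(k - 1)/(m - 1) with s\_min = 0.2 and s\_max = 0.9 (Eq. 4 in \[[1](#references)\]). A minimal sketch of that rule:
+
+```python
+def prior_box_scales(num_maps, s_min=0.2, s_max=0.9):
+    """Linear scale rule from the SSD paper: the k-th source feature map
+    uses prior boxes whose scale is this fraction of the input image size."""
+    return [s_min + (s_max - s_min) * k / (num_maps - 1.0)
+            for k in range(num_maps)]
+
+print(prior_box_scales(6))  # scales for the six source layers of SSD300
+```
+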
+The figure below shows the overall structure of the (300x300) model:
+
+Figure 1. SSD network architecture
+
+Each box in the figure represents a convolutional layer; the last two boxes represent, respectively, the aggregation of the outputs of the convolutional layers and the post-processing stage. Concretely, at prediction time the network outputs a set of candidate boxes, each carrying two kinds of information: a position and per-class scores. The second-to-last box in the figure denotes the aggregation of the network's detections. Because the candidate boxes are numerous and overlap heavily, post-processing is needed to keep a small number of high-quality boxes; here post-processing mainly means Non-Maximum Suppression (NMS), sketched below.
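+
+To make the post-processing concrete, here is a minimal greedy NMS sketch (boxes given as (xmin, ymin, xmax, ymax) plus a score). It is purely illustrative; in this example the actual suppression happens inside the ```detection_output``` layer, with thresholds taken from the configuration file:
+
+```python
+def nms(boxes, scores, overlap_threshold=0.45):
+    """Greedy non-maximum suppression: keep the highest-scoring box,
+    drop every box whose IoU with it exceeds the threshold, repeat."""
+    def iou(a, b):
+        iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
+        ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
+        inter = iw * ih
+        area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
+        return inter / (area(a) + area(b) - inter + 1e-10)
+
+    order = sorted(range(len(boxes)), key=lambda i: -scores[i])
+    keep = []
+    while order:
+        best = order.pop(0)
+        keep.append(best)
+        order = [i for i in order
+                 if iou(boxes[best], boxes[i]) <= overlap_threshold]
+    return keep
+```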
+
+As the network structure shows, candidate boxes are generated on multiple feature maps. Different feature maps have different receptive fields, so the image is effectively scanned at multiple scales; compared with other detection methods this yields a richer set of candidate boxes and therefore higher detection accuracy. At the same time, SSD's extension of VGG16 computes the positions and class scores of the candidate boxes at small extra cost, and the whole process is completed by a single convolutional network, which makes it fast.
+
+## Example Overview
+This example contains the following files:
+
+
+Table 1. Example files
+
+| File | Purpose |
+| --- | --- |
+| train.py | Training script |
+| eval.py | Evaluation script for assessing a trained model |
+| infer.py | Detection script: runs detection given images and a model |
+| visual.py | Visualization of detection results |
+| image_util.py | Common functions for image preprocessing |
+| data_provider.py | Data pipeline: produces the data for training, evaluation or detection |
+| config/pascal_voc_conf.py | Network hyperparameter configuration file |
+| data/label_list | Class list |
+| data/prepare_voc_data.py | Prepares the PASCAL VOC training file lists |
+
+
+The training stage preprocesses the data, including cropping, sampling, etc.; this is done in ```image_util.py``` and ```data_provider.py```. Note that ```config/pascal_voc_conf.py``` is the parameter configuration file, covering training parameters, network parameters and so on; the parameters it contains are configured for the PASCAL VOC data, so when training on your own data you should create a new configuration file modeled on it. The ```data/prepare_voc_data.py``` script generates the file lists, including the train/test split; before using it you must download and unpack the data yourself, and VOC2007 plus VOC2012 are used by default.
+
+## The PASCAL VOC Dataset
+### Data Preparation
+First download the datasets VOC2007 \[[3](#references)\] and VOC2012 \[[4](#references)\]. VOC2007 contains both a training set and a test set, while VOC2012 contains only a training set. Unpack the downloaded data so the directory layout is ```data/VOCdevkit/VOC2007``` and ```data/VOCdevkit/VOC2012```, then enter the ```data``` directory and run ```python prepare_voc_data.py``` to generate ```trainval.txt``` and ```test.txt```. The core function is:
+
+```python
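+# Excerpt from data/prepare_voc_data.py; it relies on the script's
+# walk_dir() helper and its imports (os.path as osp, random).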
+def prepare_filelist(devkit_dir, years, output_dir):
+ trainval_list = []
+ test_list = []
+ for year in years:
+ trainval, test = walk_dir(devkit_dir, year)
+ trainval_list.extend(trainval)
+ test_list.extend(test)
+ random.shuffle(trainval_list)
+ with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval:
+ for item in trainval_list:
+ ftrainval.write(item[0] + ' ' + item[1] + '\n')
+
+ with open(osp.join(output_dir, 'test.txt'), 'w') as ftest:
+ for item in test_list:
+ ftest.write(item[0] + ' ' + item[1] + '\n')
+```
+
+This function first processes the data of each year, then randomly shuffles the list of training image paths, and finally writes out the training and test file lists. By default ```prepare_voc_data.py``` and ```VOCdevkit``` live in the same directory, and the generated file lists are written there as well. Note that ```trainval.txt``` contains the training data of both VOC2007 and VOC2012, while ```test.txt``` contains only the VOC2007 test data. The first few lines of ```trainval.txt``` look like this:
+
+```
+VOCdevkit/VOC2007/JPEGImages/000005.jpg VOCdevkit/VOC2007/Annotations/000005.xml
+VOCdevkit/VOC2007/JPEGImages/000007.jpg VOCdevkit/VOC2007/Annotations/000007.xml
+VOCdevkit/VOC2007/JPEGImages/000009.jpg VOCdevkit/VOC2007/Annotations/000009.xml
+```
+
+Each line has two fields: the first is the relative path of an image file, the second the relative path of the corresponding annotation file.
+
+### Preparing the Pretrained Model
+Download the pretrained VGG-16 model; we provide a converted model at http://paddlepaddle.bj.bcebos.com/model_zoo/detection/ssd_model/vgg_model.tar.gz . After downloading, place it at ```vgg/vgg_model.tar.gz```.
+
+### Model Training
+Run ```python train.py``` to start training. Note that this example only supports a CUDA GPU environment and cannot be trained on CPU, mainly because CPU training is very slow; in practice, image tasks are usually handled on GPU, and this implementation hard-codes the use of cuDNN, so no CPU version is provided. The key logic of ```train.py```:
+
+```python
+paddle.init(use_gpu=True, trainer_count=4)
+data_args = data_provider.Settings(
+ data_dir='./data',
+ label_file='label_list',
+ resize_h=cfg.IMG_HEIGHT,
+ resize_w=cfg.IMG_WIDTH,
+    mean_value=[104, 117, 124])
+train(train_file_list='./data/trainval.txt',
+ dev_file_list='./data/test.txt',
+ data_args=data_args,
+ init_model_path='./vgg/vgg_model.tar.gz')
+```
+
+The main steps are:
+
+1. Call ```paddle.init``` to train on 4 GPUs.
+2. Call ```data_provider.Settings``` to configure the data preprocessing parameters; ```cfg.IMG_HEIGHT``` and ```cfg.IMG_WIDTH``` are set in the configuration file ```config/pascal_voc_conf.py``` and are both 300 here. 300x300 is a typical configuration that balances efficiency and detection accuracy; it can be extended to 512x512 by modifying the configuration file.
+3. Call ```train``` to run training, where ```train_file_list``` is the training file list, ```dev_file_list``` the evaluation file list, and ```init_model_path``` the location of the pretrained model.
+4. During training, log information is printed: after every batch the current pass, the cost of the current batch and the mAP (mean Average Precision) are output, and after every pass the model is saved, by default under ```checkpoints``` (note: this directory must be created beforehand; see the sketch after this list).
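+
+A minimal sketch (assuming the default ```checkpoints``` path used by ```train.py```) to create the directory before launching training:
+
+```python
+import os
+
+# train.py saves one model archive per pass under ./checkpoints
+if not os.path.isdir('checkpoints'):
+    os.makedirs('checkpoints')
+```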
+
+The mAP curve of SSD300x300 on the VOC data (train = VOC2007 + VOC2012, test = VOC2007) is shown below; after 140 passes the mAP reaches 71.52%.
+
+Figure 2. SSD300x300 mAP convergence curve
+
+### Model Evaluation
+Run ```python eval.py``` to evaluate a model; the key logic of ```eval.py``` is as follows:
+
+```python
+paddle.init(use_gpu=True, trainer_count=4) # use 4 gpus
+
+data_args = data_provider.Settings(
+ data_dir='./data',
+ label_file='label_list',
+ resize_h=cfg.IMG_HEIGHT,
+ resize_w=cfg.IMG_WIDTH,
+ mean_value=[104, 117, 124])
+
+eval(
+ eval_file_list='./data/test.txt',
+ batch_size=4,
+ data_args=data_args,
+ model_path='models/pass-00000.tar.gz')
+```
+
+Call ```paddle.init``` to evaluate on 4 GPUs; ```data_provider.Settings``` is the same as in the training stage; call ```eval``` to run the evaluation, where ```eval_file_list``` is the evaluation file list, ```batch_size``` the batch size used during evaluation, and ```model_path``` the model location. When evaluation finishes, the ```loss``` and ```mAP``` are printed.
+
+### Image Detection
+Run ```python infer.py``` to run detection on images with a trained model; the key logic of ```infer.py```:
+
+```python
+infer(
+ eval_file_list='./data/infer.txt',
+ save_path='infer.res',
+ data_args=data_args,
+ batch_size=4,
+ model_path='models/pass-00000.tar.gz',
+ threshold=0.3)
+```
+
+Here ```eval_file_list``` is the list of image paths; ```save_path``` is where the predictions are saved; ```data_args``` is as above; ```batch_size``` is the number of samples predicted per batch; ```model_path``` is the model location; ```threshold``` is the confidence threshold, and only detections scoring at or above it are written out. Some sample lines from ```infer.res```:
+
+```
+VOCdevkit/VOC2007/JPEGImages/006936.jpg 12 0.997844 131.255611777 162.271582842 396.475315094 334.0
+VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.998557 229.160234332 49.5991278887 314.098775387 312.913876176
+VOCdevkit/VOC2007/JPEGImages/006936.jpg 14 0.372522 187.543615699 133.727034628 345.647156239 327.448492289
+...
+```
+
+Each line has four tab-separated fields: the path of the detected image, the class of the detected box, the confidence, and the four coordinate values (space-separated).
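+
+Note that ```./data/infer.txt``` must exist before running ```infer.py```; each of its lines is an image path relative to the ```data``` directory. A minimal sketch to generate it (the source directory chosen here is just an example):
+
+```python
+import os
+
+img_dir = 'VOCdevkit/VOC2007/JPEGImages'  # example source directory
+with open('data/infer.txt', 'w') as f:
+    # list JPEG images, as paths relative to ./data, one per line
+    for name in sorted(os.listdir(os.path.join('data', img_dir))):
+        if name.endswith('.jpg'):
+            f.write(os.path.join(img_dir, name) + '\n')
+```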
+
+The example also provides a visualization script: run ```python visual.py```. The path of the detection results and the output directory must be specified (they are set at the top of the script); by default the visualized images are saved under ```./visual_res```. Below is the effect of running infer on some images with a trained model and visualizing the results:
+
+Figure 3. SSD300x300 detection visualization examples
+
+## Using Your Own Dataset
+Training PaddlePaddle SSD on your own data requires two key preparations. First, the data must be adapted to the input format the network accepts; a recommended structure, using ```train.txt``` as an example:
+
+```
+image00000_file_path image00000_annotation_file_path
+image00001_file_path image00001_annotation_file_path
+image00002_file_path image00002_annotation_file_path
+...
+```
+
+The file has two whitespace-separated columns: the path of the image file and the path of the corresponding annotation file. Reading the image files is straightforward; parsing the annotation data is slightly more involved. In this example the annotations are stored as xml files, so ```data_provider.py``` parses xml; the core logic is as follows:
+
+```python
+bbox_labels = []
+root = xml.etree.ElementTree.parse(label_path).getroot()
+for obj in root.findall('object'):
+    bbox_sample = []
+    # class labels start from 1; index 0 in label_list is the background
+    bbox_sample.append(float(settings.label_list.index(
+        obj.find('name').text)))
+    bbox = obj.find('bndbox')
+    difficult = float(obj.find('difficult').text)
+ bbox_sample.append(float(bbox.find('xmin').text)/img_width)
+ bbox_sample.append(float(bbox.find('ymin').text)/img_height)
+ bbox_sample.append(float(bbox.find('xmax').text)/img_width)
+ bbox_sample.append(float(bbox.find('ymax').text)/img_height)
+ bbox_sample.append(difficult)
+ bbox_labels.append(bbox_sample)
+```
+
+Each annotation record thus contains: label, xmin, ymin, xmax, ymax and is\_difficult, where is\_difficult marks whether the object is a hard example; if you do not need it in practice, simply set this field to zero. Your own data needs matching parsing logic. Suppose the annotation data (e.g. image00000\_annotation\_file\_path) is stored in the following format:
+
+```
+label1 xmin1 ymin1 xmax1 ymax1
+label2 xmin2 ymin2 xmax2 ymax2
+...
+```
+
+Each line describes one object with 5 fields: the first is the label (note: background is 0, so object labels must be numbered from 1), the remaining four are the coordinates. The parsing logic can then be changed to:
+
+```python
+bbox_labels = []
+with open(label_path) as flabel:
+ for line in flabel:
+ bbox_sample = []
+ bbox = [float(i) for i in line.strip().split()]
+ label = bbox[0]
+ bbox_sample.append(label)
+ bbox_sample.append(bbox[1]/float(img_width))
+ bbox_sample.append(bbox[2]/float(img_height))
+ bbox_sample.append(bbox[3]/float(img_width))
+ bbox_sample.append(bbox[4]/float(img_height))
+ bbox_sample.append(0.0)
+ bbox_labels.append(bbox_sample)
+```
+
+The other important task is to adapt the network configuration to the image size, the size of the objects to detect, and so on; mainly, create your own configuration file modeled on ```config/pascal_voc_conf.py```. For guidance on setting the parameters, refer to the paper \[[1](#references)\]; a sketch of the kind of fields to revisit follows.
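+
+A minimal illustrative sketch, assuming an easydict-style configuration object (the actual structure of your file should mirror ```config/pascal_voc_conf.py```); the field names below are the ones referenced by the scripts in this example, and the values are examples only:
+
+```python
+from easydict import EasyDict as edict
+
+cfg = edict()
+# input geometry; 300x300 here, 512x512 is the other common choice
+cfg.IMG_CHANNEL = 3
+cfg.IMG_HEIGHT = 300
+cfg.IMG_WIDTH = 300
+# number of classes including background, and the background label id
+cfg.CLASS_NUM = 21
+cfg.BACKGROUND_ID = 0
+cfg.TRAIN = edict()
+cfg.TRAIN.BATCH_SIZE = 32   # reduce if GPU memory is tight
+cfg.TRAIN.NUM_PASS = 200
+# the per-layer prior-box sizes (e.g. cfg.NET.CONV4.PB.MIN_SIZE in the
+# PASCAL VOC configuration) should track the object scales in your data
+```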
+
+## References
+1. Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, Cheng-Yang Fu, Alexander C. Berg. [SSD: Single Shot MultiBox Detector](https://arxiv.org/abs/1512.02325). European Conference on Computer Vision. Springer, Cham, 2016.
+2. Simonyan, Karen, and Andrew Zisserman. [Very deep convolutional networks for large-scale image recognition](https://arxiv.org/abs/1409.1556). arXiv preprint arXiv:1409.1556 (2014).
+3. [The PASCAL Visual Object Classes Challenge 2007](http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html)
+4. [Visual Object Classes Challenge 2012 (VOC2012)](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html)
+
+
+
+
+
+
diff --git a/ssd/infer.py b/ssd/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0bc79189935d8bdd59f17756b9c95581870f36a
--- /dev/null
+++ b/ssd/infer.py
@@ -0,0 +1,98 @@
+import paddle.v2 as paddle
+import data_provider
+import vgg_ssd_net
+import os, sys
+import numpy as np
+import gzip
+from PIL import Image
+from config.pascal_voc_conf import cfg
+
+
+def _infer(inferer, infer_data, threshold):
+ ret = []
+ infer_res = inferer.infer(input=infer_data)
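+    # each result row: [image index within the batch, label, confidence,
+    # xmin, ymin, xmax, ymax], with coordinates normalized to [0, 1]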
+ keep_inds = np.where(infer_res[:, 2] >= threshold)[0]
+ for idx in keep_inds:
+ ret.append([
+ infer_res[idx][0], infer_res[idx][1] - 1, infer_res[idx][2],
+ infer_res[idx][3], infer_res[idx][4], infer_res[idx][5],
+ infer_res[idx][6]
+ ])
+ return ret
+
+
+def save_batch_res(ret_res, img_w, img_h, fname_list, fout):
+ for det_res in ret_res:
+ img_idx = int(det_res[0])
+ label = int(det_res[1])
+ conf_score = det_res[2]
+ xmin = det_res[3] * img_w[img_idx]
+ ymin = det_res[4] * img_h[img_idx]
+ xmax = det_res[5] * img_w[img_idx]
+ ymax = det_res[6] * img_h[img_idx]
+ fout.write(fname_list[img_idx] + '\t' + str(label) + '\t' + str(
+ conf_score) + '\t' + str(xmin) + ' ' + str(ymin) + ' ' + str(xmax) +
+ ' ' + str(ymax))
+ fout.write('\n')
+
+
+def infer(eval_file_list, save_path, data_args, batch_size, model_path,
+ threshold):
+ detect_out = vgg_ssd_net.net_conf(mode='infer')
+
+ assert os.path.isfile(model_path), 'Invalid model.'
+ parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
+
+ inferer = paddle.inference.Inference(
+ output_layer=detect_out, parameters=parameters)
+
+ reader = data_provider.infer(data_args, eval_file_list)
+ all_fname_list = [line.strip() for line in open(eval_file_list).readlines()]
+
+ test_data = []
+ fname_list = []
+ img_w = []
+ img_h = []
+ idx = 0
+ """Do inference batch by batch,
+ coords of bbox will be scaled based on image size
+ """
+ with open(save_path, 'w') as fout:
+ for img in reader():
+ test_data.append([img])
+ fname_list.append(all_fname_list[idx])
+ w, h = Image.open(os.path.join('./data', fname_list[-1])).size
+ img_w.append(w)
+ img_h.append(h)
+ if len(test_data) == batch_size:
+ ret_res = _infer(inferer, test_data, threshold)
+ save_batch_res(ret_res, img_w, img_h, fname_list, fout)
+ test_data = []
+ fname_list = []
+ img_w = []
+ img_h = []
+
+ idx += 1
+
+ if len(test_data) > 0:
+ ret_res = _infer(inferer, test_data, threshold)
+ save_batch_res(ret_res, img_w, img_h, fname_list, fout)
+
+
+if __name__ == "__main__":
+ paddle.init(use_gpu=True, trainer_count=1)
+
+ data_args = data_provider.Settings(
+ data_dir='./data',
+ label_file='label_list',
+ resize_h=cfg.IMG_HEIGHT,
+ resize_w=cfg.IMG_WIDTH,
+ mean_value=[104, 117, 124])
+
+ infer(
+ eval_file_list='./data/infer.txt',
+ save_path='infer.res',
+ data_args=data_args,
+ batch_size=4,
+ model_path='models/pass-00000.tar.gz',
+ threshold=0.3)
diff --git a/ssd/train.py b/ssd/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..783944214b67d15af31267c8ba1ded3fa48e6cb0
--- /dev/null
+++ b/ssd/train.py
@@ -0,0 +1,84 @@
+import paddle.v2 as paddle
+import data_provider
+import vgg_ssd_net
+import os, sys
+import gzip
+import tarfile
+from config.pascal_voc_conf import cfg
+
+
+def train(train_file_list, dev_file_list, data_args, init_model_path):
+ optimizer = paddle.optimizer.Momentum(
+ momentum=cfg.TRAIN.MOMENTUM,
+ learning_rate=cfg.TRAIN.LEARNING_RATE,
+ regularization=paddle.optimizer.L2Regularization(
+ rate=cfg.TRAIN.L2REGULARIZATION),
+ learning_rate_decay_a=cfg.TRAIN.LEARNING_RATE_DECAY_A,
+ learning_rate_decay_b=cfg.TRAIN.LEARNING_RATE_DECAY_B,
+ learning_rate_schedule=cfg.TRAIN.LEARNING_RATE_SCHEDULE)
+
+ cost, detect_out = vgg_ssd_net.net_conf('train')
+
+ parameters = paddle.parameters.create(cost)
+    if init_model_path is not None:
+ assert os.path.isfile(init_model_path), 'Invalid model.'
+ parameters.init_from_tar(gzip.open(init_model_path))
+
+ trainer = paddle.trainer.SGD(
+ cost=cost,
+ parameters=parameters,
+ extra_layers=[detect_out],
+ update_equation=optimizer)
+
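+    # map the reader's tuple fields to the 'image' and 'bbox' data layers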
+ feeding = {'image': 0, 'bbox': 1}
+
+ train_reader = paddle.batch(
+ data_provider.train(data_args, train_file_list),
+        batch_size=cfg.TRAIN.BATCH_SIZE)  # yields one batch of images per call
+
+ dev_reader = paddle.batch(
+ data_provider.test(data_args, dev_file_list),
+ batch_size=cfg.TRAIN.BATCH_SIZE)
+
+ def event_handler(event):
+ if isinstance(event, paddle.event.EndIteration):
+ if event.batch_id % 1 == 0:
+ print "\nPass %d, Batch %d, TrainCost %f, Detection mAP=%f" % \
+ (event.pass_id,
+ event.batch_id,
+ event.cost,
+ event.metrics['detection_evaluator'])
+ else:
+ sys.stdout.write('.')
+ sys.stdout.flush()
+
+ if isinstance(event, paddle.event.EndPass):
+ with gzip.open('checkpoints/params_pass_%05d.tar.gz' % \
+ event.pass_id, 'w') as f:
+ parameters.to_tar(f)
+ result = trainer.test(reader=dev_reader, feeding=feeding)
+ print "\nTest with Pass %d, TestCost: %f, Detection mAP=%g" % \
+ (event.pass_id,
+ result.cost,
+ result.metrics['detection_evaluator'])
+
+ trainer.train(
+ reader=train_reader,
+ event_handler=event_handler,
+ num_passes=cfg.TRAIN.NUM_PASS,
+ feeding=feeding)
+
+
+if __name__ == "__main__":
+ paddle.init(use_gpu=True, trainer_count=4)
+ data_args = data_provider.Settings(
+ data_dir='./data',
+ label_file='label_list',
+ resize_h=cfg.IMG_HEIGHT,
+ resize_w=cfg.IMG_WIDTH,
+ mean_value=[104, 117, 124])
+ train(
+ train_file_list='./data/trainval.txt',
+ dev_file_list='./data/test.txt',
+ data_args=data_args,
+ init_model_path='./vgg/vgg_model.tar.gz')
diff --git a/ssd/vgg_ssd_net.py b/ssd/vgg_ssd_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5c107e6fda6e58ff2b27c55bd4773639d36aab
--- /dev/null
+++ b/ssd/vgg_ssd_net.py
@@ -0,0 +1,278 @@
+import paddle.v2 as paddle
+from config.pascal_voc_conf import cfg
+
+
+def net_conf(mode):
+ """Network configuration. Total three modes included 'train' 'eval'
+ and 'infer'. Loss and mAP evaluation layer will return if using 'train'
+ and 'eval'. In 'infer' mode, only detection output layer will be returned.
+ """
+ default_l2regularization = cfg.TRAIN.L2REGULARIZATION
+
+ default_bias_attr = paddle.attr.ParamAttr(l2_rate=0.0, learning_rate=2.0)
+ default_static_bias_attr = paddle.attr.ParamAttr(is_static=True)
+
+ def get_param_attr(local_lr, regularization):
+ is_static = False
+ if local_lr == 0.0:
+ is_static = True
+ return paddle.attr.ParamAttr(
+ learning_rate=local_lr, l2_rate=regularization, is_static=is_static)
+
+ def conv_group(stack_num, name_list, input, filter_size_list, num_channels,
+ num_filters_list, stride_list, padding_list,
+ common_bias_attr, common_param_attr, common_act):
+ conv = input
+ in_channels = num_channels
+ for i in xrange(stack_num):
+ conv = paddle.layer.img_conv(
+ name=name_list[i],
+ input=conv,
+ filter_size=filter_size_list[i],
+ num_channels=in_channels,
+ num_filters=num_filters_list[i],
+ stride=stride_list[i],
+ padding=padding_list[i],
+ bias_attr=common_bias_attr,
+ param_attr=common_param_attr,
+ act=common_act)
+ in_channels = num_filters_list[i]
+ return conv
+
+ def vgg_block(idx_str, input, num_channels, num_filters, pool_size,
+ pool_stride, pool_pad):
+ layer_name = "conv%s_" % idx_str
+ stack_num = 3
+ name_list = [layer_name + str(i + 1) for i in xrange(3)]
+
+ conv = conv_group(stack_num, name_list, input, [3] * stack_num,
+ num_channels, [num_filters] * stack_num,
+ [1] * stack_num, [1] * stack_num, default_bias_attr,
+ get_param_attr(1, default_l2regularization),
+ paddle.activation.Relu())
+
+ pool = paddle.layer.img_pool(
+ input=conv,
+ pool_size=pool_size,
+ num_channels=num_filters,
+ pool_type=paddle.pooling.CudnnMax(),
+ stride=pool_stride,
+ padding=pool_pad)
+ return conv, pool
+
+ def mbox_block(layer_idx, input, num_channels, filter_size, loc_filters,
+ conf_filters):
+ mbox_loc_name = layer_idx + "_mbox_loc"
+ mbox_loc = paddle.layer.img_conv(
+ name=mbox_loc_name,
+ input=input,
+ filter_size=filter_size,
+ num_channels=num_channels,
+ num_filters=loc_filters,
+ stride=1,
+ padding=1,
+ bias_attr=default_bias_attr,
+ param_attr=get_param_attr(1, default_l2regularization),
+ act=paddle.activation.Identity())
+
+ mbox_conf_name = layer_idx + "_mbox_conf"
+ mbox_conf = paddle.layer.img_conv(
+ name=mbox_conf_name,
+ input=input,
+ filter_size=filter_size,
+ num_channels=num_channels,
+ num_filters=conf_filters,
+ stride=1,
+ padding=1,
+ bias_attr=default_bias_attr,
+ param_attr=get_param_attr(1, default_l2regularization),
+ act=paddle.activation.Identity())
+
+ return mbox_loc, mbox_conf
+
+ def ssd_block(layer_idx, input, img_shape, num_channels, num_filters1,
+ num_filters2, aspect_ratio, variance, min_size, max_size):
+ layer_name = "conv" + layer_idx + "_"
+ stack_num = 2
+ conv1_name = layer_name + "1"
+ conv2_name = layer_name + "2"
+ conv2 = conv_group(stack_num, [conv1_name, conv2_name], input, [1, 3],
+ num_channels, [num_filters1, num_filters2], [1, 2],
+ [0, 1], default_bias_attr,
+ get_param_attr(1, default_l2regularization),
+ paddle.activation.Relu())
+
+ loc_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
+ conf_filters = (
+ len(aspect_ratio) * 2 + 1 + len(max_size)) * cfg.CLASS_NUM
+ mbox_loc, mbox_conf = mbox_block(conv2_name, conv2, num_filters2, 3,
+ loc_filters, conf_filters)
+ mbox_priorbox = paddle.layer.priorbox(
+ input=conv2,
+ image=img_shape,
+ min_size=min_size,
+ max_size=max_size,
+ aspect_ratio=aspect_ratio,
+ variance=variance)
+
+ return conv2, mbox_loc, mbox_conf, mbox_priorbox
+
+ img = paddle.layer.data(
+ name='image',
+ type=paddle.data_type.dense_vector(cfg.IMG_CHANNEL * cfg.IMG_HEIGHT *
+ cfg.IMG_WIDTH),
+ height=cfg.IMG_HEIGHT,
+ width=cfg.IMG_WIDTH)
+
+ stack_num = 2
+ conv1_2 = conv_group(stack_num, ['conv1_1', 'conv1_2'], img,
+ [3] * stack_num, 3, [64] * stack_num, [1] * stack_num,
+ [1] * stack_num, default_static_bias_attr,
+ get_param_attr(0, 0), paddle.activation.Relu())
+
+ pool1 = paddle.layer.img_pool(
+ name="pool1",
+ input=conv1_2,
+ pool_type=paddle.pooling.CudnnMax(),
+ pool_size=2,
+ num_channels=64,
+ stride=2)
+
+ stack_num = 2
+ conv2_2 = conv_group(stack_num, ['conv2_1', 'conv2_2'], pool1, [3] *
+ stack_num, 64, [128] * stack_num, [1] * stack_num,
+ [1] * stack_num, default_static_bias_attr,
+ get_param_attr(0, 0), paddle.activation.Relu())
+
+ pool2 = paddle.layer.img_pool(
+ name="pool2",
+ input=conv2_2,
+ pool_type=paddle.pooling.CudnnMax(),
+ pool_size=2,
+ num_channels=128,
+ stride=2)
+
+ conv3_3, pool3 = vgg_block("3", pool2, 128, 256, 2, 2, 0)
+
+ conv4_3, pool4 = vgg_block("4", pool3, 256, 512, 2, 2, 0)
+ conv4_3_mbox_priorbox = paddle.layer.priorbox(
+ input=conv4_3,
+ image=img,
+ min_size=cfg.NET.CONV4.PB.MIN_SIZE,
+ aspect_ratio=cfg.NET.CONV4.PB.ASPECT_RATIO,
+ variance=cfg.NET.CONV4.PB.VARIANCE)
+ conv4_3_norm = paddle.layer.cross_channel_norm(
+ name="conv4_3_norm",
+ input=conv4_3,
+ param_attr=paddle.attr.ParamAttr(
+ initial_mean=20, initial_std=0, is_static=False, learning_rate=1))
+ conv4_3_norm_mbox_loc, conv4_3_norm_mbox_conf = \
+ mbox_block("conv4_3_norm", conv4_3_norm, 512, 3, 12, 63)
+
+ conv5_3, pool5 = vgg_block("5", pool4, 512, 512, 3, 1, 1)
+
+ stack_num = 2
+ fc7 = conv_group(stack_num, ['fc6', 'fc7'], pool5, [3, 1], 512, [1024] *
+ stack_num, [1] * stack_num, [1, 0], default_bias_attr,
+ get_param_attr(1, default_l2regularization),
+ paddle.activation.Relu())
+
+ fc7_mbox_loc, fc7_mbox_conf = mbox_block("fc7", fc7, 1024, 3, 24, 126)
+ fc7_mbox_priorbox = paddle.layer.priorbox(
+ input=fc7,
+ image=img,
+ min_size=cfg.NET.FC7.PB.MIN_SIZE,
+ max_size=cfg.NET.FC7.PB.MAX_SIZE,
+ aspect_ratio=cfg.NET.FC7.PB.ASPECT_RATIO,
+ variance=cfg.NET.FC7.PB.VARIANCE)
+
+ conv6_2, conv6_2_mbox_loc, conv6_2_mbox_conf, conv6_2_mbox_priorbox = \
+ ssd_block("6", fc7, img, 1024, 256, 512,
+ cfg.NET.CONV6.PB.ASPECT_RATIO,
+ cfg.NET.CONV6.PB.VARIANCE,
+ cfg.NET.CONV6.PB.MIN_SIZE,
+ cfg.NET.CONV6.PB.MAX_SIZE)
+ conv7_2, conv7_2_mbox_loc, conv7_2_mbox_conf, conv7_2_mbox_priorbox = \
+ ssd_block("7", conv6_2, img, 512, 128, 256,
+ cfg.NET.CONV7.PB.ASPECT_RATIO,
+ cfg.NET.CONV7.PB.VARIANCE,
+ cfg.NET.CONV7.PB.MIN_SIZE,
+ cfg.NET.CONV7.PB.MAX_SIZE)
+ conv8_2, conv8_2_mbox_loc, conv8_2_mbox_conf, conv8_2_mbox_priorbox = \
+ ssd_block("8", conv7_2, img, 256, 128, 256,
+ cfg.NET.CONV8.PB.ASPECT_RATIO,
+ cfg.NET.CONV8.PB.VARIANCE,
+ cfg.NET.CONV8.PB.MIN_SIZE,
+ cfg.NET.CONV8.PB.MAX_SIZE)
+
+ pool6 = paddle.layer.img_pool(
+ name="pool6",
+ input=conv8_2,
+ pool_size=3,
+ num_channels=256,
+ stride=1,
+ pool_type=paddle.pooling.Avg())
+ pool6_mbox_loc, pool6_mbox_conf = mbox_block("pool6", pool6, 256, 3, 24,
+ 126)
+ pool6_mbox_priorbox = paddle.layer.priorbox(
+ input=pool6,
+ image=img,
+ min_size=cfg.NET.POOL6.PB.MIN_SIZE,
+ max_size=cfg.NET.POOL6.PB.MAX_SIZE,
+ aspect_ratio=cfg.NET.POOL6.PB.ASPECT_RATIO,
+ variance=cfg.NET.POOL6.PB.VARIANCE)
+
+ mbox_priorbox = paddle.layer.concat(
+ name="mbox_priorbox",
+ input=[
+ conv4_3_mbox_priorbox, fc7_mbox_priorbox, conv6_2_mbox_priorbox,
+ conv7_2_mbox_priorbox, conv8_2_mbox_priorbox, pool6_mbox_priorbox
+ ])
+
+ loc_loss_input = [
+ conv4_3_norm_mbox_loc, fc7_mbox_loc, conv6_2_mbox_loc, conv7_2_mbox_loc,
+ conv8_2_mbox_loc, pool6_mbox_loc
+ ]
+
+ conf_loss_input = [
+ conv4_3_norm_mbox_conf, fc7_mbox_conf, conv6_2_mbox_conf,
+ conv7_2_mbox_conf, conv8_2_mbox_conf, pool6_mbox_conf
+ ]
+
+ detection_out = paddle.layer.detection_output(
+ input_loc=loc_loss_input,
+ input_conf=conf_loss_input,
+ priorbox=mbox_priorbox,
+ confidence_threshold=cfg.NET.DETOUT.CONFIDENCE_THRESHOLD,
+ nms_threshold=cfg.NET.DETOUT.NMS_THRESHOLD,
+ num_classes=cfg.CLASS_NUM,
+ nms_top_k=cfg.NET.DETOUT.NMS_TOP_K,
+ keep_top_k=cfg.NET.DETOUT.KEEP_TOP_K,
+ background_id=cfg.BACKGROUND_ID,
+ name="detection_output")
+
+ if mode == 'train' or mode == 'eval':
+ bbox = paddle.layer.data(
+ name='bbox', type=paddle.data_type.dense_vector_sequence(6))
+ loss = paddle.layer.multibox_loss(
+ input_loc=loc_loss_input,
+ input_conf=conf_loss_input,
+ priorbox=mbox_priorbox,
+ label=bbox,
+ num_classes=cfg.CLASS_NUM,
+ overlap_threshold=cfg.NET.MBLOSS.OVERLAP_THRESHOLD,
+ neg_pos_ratio=cfg.NET.MBLOSS.NEG_POS_RATIO,
+ neg_overlap=cfg.NET.MBLOSS.NEG_OVERLAP,
+ background_id=cfg.BACKGROUND_ID,
+ name="multibox_loss")
+ paddle.evaluator.detection_map(
+ input=detection_out,
+ label=bbox,
+ overlap_threshold=cfg.NET.DETMAP.OVERLAP_THRESHOLD,
+ background_id=cfg.BACKGROUND_ID,
+ evaluate_difficult=cfg.NET.DETMAP.EVAL_DIFFICULT,
+ ap_type=cfg.NET.DETMAP.AP_TYPE,
+ name="detection_evaluator")
+ return loss, detection_out
+ elif mode == 'infer':
+ return detection_out
diff --git a/ssd/visual.py b/ssd/visual.py
new file mode 100644
index 0000000000000000000000000000000000000000..278fd34af1a7e817012c27f38647f9ce76f0c803
--- /dev/null
+++ b/ssd/visual.py
@@ -0,0 +1,33 @@
+import cv2
+import os
+
+data_dir = './data'
+infer_file = './infer.res'
+out_dir = './visual_res'
+
+path_to_im = dict()
+
+for line in open(infer_file):
+ img_path, _, _, _ = line.strip().split('\t')
+ if img_path not in path_to_im:
+ im = cv2.imread(os.path.join(data_dir, img_path))
+ path_to_im[img_path] = im
+
+for line in open(infer_file):
+ img_path, label, conf, bbox = line.strip().split('\t')
+ xmin, ymin, xmax, ymax = map(float, bbox.split(' '))
+ xmin = int(round(xmin))
+ ymin = int(round(ymin))
+ xmax = int(round(xmax))
+ ymax = int(round(ymax))
+
+ img = path_to_im[img_path]
+ cv2.rectangle(img, (xmin, ymin), (xmax, ymax),
+ (0, (1 - xmin) * 255, xmin * 255), 2)
+
+for img_path in path_to_im:
+ im = path_to_im[img_path]
+ out_path = os.path.join(out_dir, os.path.basename(img_path))
+ cv2.imwrite(out_path, im)
+
+print 'Done.'