Commit e01d3c62 authored by Yibing Liu

Merge with the image-related guides

......@@ -185,6 +185,9 @@ class AsyncDataReader(object):
corresponding description file.
drop_frame_len (int): Samples whose label length is above this value
will be dropped. (Use -1 to disable this policy.)
split_sentence_threshold (int): Sentences whose length is larger than
this value will be split.
(Assign -1 to disable splitting.)
proc_num (int): Number of processes for processing data.
sample_buffer_size (int): Buffer size to indicate the maximum samples
cached.
......@@ -204,6 +207,7 @@ class AsyncDataReader(object):
feature_file_list,
label_file_list="",
drop_frame_len=512,
split_sentence_threshold=1024,
proc_num=10,
sample_buffer_size=1024,
sample_info_buffer_size=1024,
......@@ -214,6 +218,7 @@ class AsyncDataReader(object):
self._feature_file_list = feature_file_list
self._label_file_list = label_file_list
self._drop_frame_len = drop_frame_len
self._split_sentence_threshold = split_sentence_threshold
self._shuffle_block_num = shuffle_block_num
self._block_info_list = None
self._rng = random.Random(random_seed)
......@@ -262,7 +267,8 @@ class AsyncDataReader(object):
map(lambda info: info[0], bucket_block_info),
map(lambda info: info[1], bucket_block_info),
map(lambda info: info[2], bucket_block_info),
map(lambda info: info[3], bucket_block_info)))
map(lambda info: info[3], bucket_block_info),
split_sentence_threshold=self._split_sentence_threshold))
# @TODO make this configurable
def set_transformers(self, transformers):
......
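For context, the new argument plugs straight into the constructor shown above. A minimal usage sketch follows (the import path is an assumption; the scripts touched by this commit simply refer to the module as `reader`):
```
# Sketch only: constructing the reader with the new split_sentence_threshold
# argument, mirroring the calls added to tools/profile.py and train.py below.
import data_utils.async_data_reader as reader  # assumed module path

data_reader = reader.AsyncDataReader(
    'data/train_feature.lst',       # feature description file list
    'data/train_label.lst',         # label description file list
    -1,                             # drop_frame_len: -1 disables frame dropping
    split_sentence_threshold=1024)  # split sentences longer than 1024 frames
```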
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../../tools/profile.py --feature_lst data/train_feature.lst \
--label_lst data/train_label.lst \
--mean_var data/aishell/global_mean_var \
--parallel \
--mean_var data/global_mean_var \
--frame_dim 80 \
--class_num 3040 \
--batch_size 16
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=4,5,6,7
python -u ../../train.py --train_feature_lst data/train_feature.lst \
--train_label_lst data/train_label.lst \
--val_feature_lst data/val_feature.lst \
--val_label_lst data/val_label.lst \
--mean_var data/aishell/global_mean_var \
--mean_var data/global_mean_var \
--checkpoints checkpoints \
--frame_dim 80 \
--class_num 3040 \
......@@ -11,4 +11,3 @@ python -u ../../train.py --train_feature_lst data/train_feature.lst \
--batch_size 64 \
--learning_rate 6.4e-5 \
--parallel
......@@ -162,7 +162,12 @@ def infer_from_ckpt(args):
infer_program = fluid.default_main_program().clone()
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......
......@@ -137,7 +137,12 @@ def profile(args):
class_num=args.class_num,
parallel=args.parallel)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......@@ -150,7 +155,8 @@ def profile(args):
trans_splice.TransSplice(5, 5), trans_delay.TransDelay(5)
]
data_reader = reader.AsyncDataReader(args.feature_lst, args.label_lst, -1)
data_reader = reader.AsyncDataReader(
args.feature_lst, args.label_lst, -1, split_sentence_threshold=1024)
data_reader.set_transformers(ltrans)
feature_t = fluid.LoDTensor()
......
......@@ -159,7 +159,12 @@ def train(args):
test_program = fluid.default_main_program().clone()
#optimizer = fluid.optimizer.Momentum(learning_rate=args.learning_rate, momentum=0.9)
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=fluid.layers.exponential_decay(
learning_rate=args.learning_rate,
decay_steps=1879,
decay_rate=1 / 1.2,
staircase=True))
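# With staircase=True the effective learning rate is
# args.learning_rate * (1 / 1.2) ** floor(step / 1879), i.e. it is divided by
# 1.2 after every 1879 mini-batches (e.g. 6.4e-5 -> ~5.3e-5 -> ~4.4e-5).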
optimizer.minimize(avg_cost)
place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
......@@ -186,8 +191,11 @@ def train(args):
os.path.exists(args.val_label_lst)):
return -1.0, -1.0
# test data reader
test_data_reader = reader.AsyncDataReader(args.val_feature_lst,
args.val_label_lst)
test_data_reader = reader.AsyncDataReader(
args.val_feature_lst,
args.val_label_lst,
-1,
split_sentence_threshold=1024)
test_data_reader.set_transformers(ltrans)
test_costs, test_accs = [], []
for batch_id, batch_data in enumerate(
......@@ -212,8 +220,11 @@ def train(args):
return np.mean(test_costs), np.mean(test_accs)
# train data reader
train_data_reader = reader.AsyncDataReader(args.train_feature_lst,
args.train_label_lst, -1)
train_data_reader = reader.AsyncDataReader(
args.train_feature_lst,
args.train_label_lst,
-1,
split_sentence_threshold=1024)
train_data_reader.set_transformers(ltrans)
# train
......
# Reproduce DQN, DoubleDQN, DuelingDQN model with fluid version of PaddlePaddle
[中文版](README_cn.md)
+ DQN in:
## Reproducing DQN, DoubleDQN, and DuelingDQN models with the Fluid version of PaddlePaddle
Based on PaddlePaddle's next-generation API, Fluid, this repository reproduces the DQN family of deep reinforcement learning models and matches the metrics reported in the original papers on classic Atari games. The models take the game screen image as input and use an end-to-end network to directly predict the next action. The repository contains the following three models.
+ DQN in
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
+ DoubleDQN in:
[Deep Reinforcement Learning with Double Q-Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389)
+ DuelingDQN in:
[Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.html)
# Atari benchmark & performance
## [Atari games introduction](https://gym.openai.com/envs/#atari)
## Atari benchmark & performance
### [Atari games introduction](https://gym.openai.com/envs/#atari)
+ Pong game result
### Pong game result
The average game reward obtained by each of the three models as training progresses is shown below (roughly 3 hours per 1 million steps):
![DQN result](assets/dqn.png)
# How to use
+ Dependencies:
+ python2.7
+ gym
+ tqdm
+ paddlepaddle-gpu==0.12.0
+ Start Training:
## How to use
### Dependencies:
+ python2.7
+ gym
+ tqdm
+ opencv-python
+ paddlepaddle-gpu>=0.12.0
+ ale_python_interface
### Install Dependencies:
+ Install PaddlePaddle:
It is recommended to compile and install PaddlePaddle from source.
+ Install other dependencies:
```
pip install -r requirement.txt
pip install gym[atari]
```
# To train a model for Pong game with gpu (use DQN model as default)
python train.py --rom ./rom_files/pong.bin --use_cuda
To install ale_python_interface, see: https://github.com/mgbellemare/Arcade-Learning-Environment
# To train a model for Pong with DoubleDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN
### Start Training:
```
# To train a model for Pong game with gpu (use DQN model as default)
python train.py --rom ./rom_files/pong.bin --use_cuda
# To train a model for Pong with DuelingDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN
```
# To train a model for Pong with DoubleDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN
# To train a model for Pong with DuelingDQN
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN
```
To train on more games, download additional ROM files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).
+ Start Testing:
```
# Play the game with saved model and calculate the average rewards
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong/stepXXXXX
### Start Testing:
```
# Play the game with saved best model and calculate the average rewards
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong
# Play the game with visualization
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong/stepXXXXX --viz 0.01
```
# Play the game with visualization
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong --viz 0.01
```
[Here](https://pan.baidu.com/s/1gIsbNw5V7tMeb74ojx-TMA) are saved models for the Pong and Breakout games. You can use them to play the games directly.
## Reproducing DQN, DoubleDQN and DuelingDQN with the Fluid version of PaddlePaddle
Based on PaddlePaddle's next-generation API, Fluid, this repository reproduces the DQN models from deep reinforcement learning and matches the metrics reported in the papers on classic Atari games. The models take the game screen image as input and predict the next control signal end to end. The repository contains the following three models.
+ DQN model:
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
+ DoubleDQN model:
[Deep Reinforcement Learning with Double Q-Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12389)
+ DuelingDQN model:
[Dueling Network Architectures for Deep Reinforcement Learning](http://proceedings.mlr.press/v48/wangf16.html)
## Results: performance on Atari games
### [Introduction to Atari games](https://gym.openai.com/envs/#atari)
### Pong training results
The average game reward obtained by the three models as training progresses is shown below (roughly 3 hours per 1 million steps):
![DQN result](assets/dqn.png)
## How to use
### Dependencies:
+ python2.7
+ gym
+ tqdm
+ opencv-python
+ paddlepaddle-gpu>=0.12.0
+ ale_python_interface
### Install dependencies:
+ Install PaddlePaddle:
It is recommended to compile and install PaddlePaddle from source.
+ Install other dependencies:
```
pip install -r requirement.txt
pip install gym[atari]
```
To install ale_python_interface, see: https://github.com/mgbellemare/Arcade-Learning-Environment
### Start training:
```
# Train a model for the Pong game on GPU (DQN model by default)
python train.py --rom ./rom_files/pong.bin --use_cuda
# Train the DoubleDQN model
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DoubleDQN
# Train the DuelingDQN model
python train.py --rom ./rom_files/pong.bin --use_cuda --alg DuelingDQN
```
To train on more games, download the game ROM files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).
### Start testing:
```
# Play the game with saved model and calculate the average rewards
# Play the game with the best model saved during training and compute the average rewards
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong
# Play the game with visualization
python play.py --rom ./rom_files/pong.bin --use_cuda --model_path ./saved_model/DQN-pong --viz 0.01
```
[Here](https://pan.baidu.com/s/1gIsbNw5V7tMeb74ojx-TMA) are trained models for the Pong and Breakout games, which can be used for testing directly.
......@@ -11,7 +11,7 @@ from tqdm import tqdm
def predict_action(exe, state, predict_program, feed_names, fetch_targets,
action_dim):
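# epsilon-greedy exploration: with probability 0.01, pick a uniformly random action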
if np.random.randint(100) == 0:
if np.random.random() < 0.01:
act = np.random.randint(action_dim)
else:
state = np.expand_dims(state, axis=0)
......
numpy
gym
tqdm
opencv-python
paddlepaddle-gpu==0.12.0
......@@ -120,6 +120,9 @@ def train_agent():
pbar = tqdm(total=1e8)
recent_100_reward = []
total_step = 0
max_reward = None
save_path = os.path.join(args.model_dirname, '{}-{}'.format(
args.alg, os.path.basename(args.rom).split('.')[0]))
while True:
# start epoch
total_reward, step = run_train_episode(agent, env, exp)
......@@ -134,14 +137,11 @@ def train_agent():
print("eval_agent done, (steps, eval_reward): ({}, {})".format(
total_step, eval_reward))
if total_step // args.save_every_steps == save_flag:
save_flag += 1
save_path = os.path.join(args.model_dirname, '{}-{}'.format(
args.alg, os.path.basename(args.rom).split('.')[0]),
'step{}'.format(total_step))
fluid.io.save_inference_model(save_path, ['state'],
agent.pred_value, agent.exe,
agent.predict_program)
if max_reward is None or eval_reward > max_reward:
max_reward = eval_reward
fluid.io.save_inference_model(save_path, ['state'],
agent.pred_value, agent.exe,
agent.predict_program)
pbar.close()
......@@ -173,11 +173,6 @@ if __name__ == '__main__':
type=str,
default='saved_model',
help='dirname to save model')
parser.add_argument(
'--save_every_steps',
type=int,
default=100000,
help='every steps number to save model')
parser.add_argument(
'--test_every_steps',
type=int,
......
# Introduction to models
## Image classification
Image classification distinguishes images of different categories based on their semantic content. It is a fundamental problem in computer vision and the basis of higher-level vision tasks such as object detection, image segmentation, object tracking, behavior analysis and face recognition, with wide applications in many fields: face recognition and intelligent video analysis in security, traffic scene recognition in transportation, content-based image retrieval and automatic album categorization on the internet, and image recognition in medicine.
In the deep learning era, image classification accuracy has improved dramatically. For the image classification task, we show how to train commonly used models on the classic ImageNet dataset, including AlexNet, VGG, GoogLeNet, ResNet, Inception-v4, MobileNet, DPN (Dual Path Network) and SE-ResNeXt, and we open-source the [trained models](https://github.com/PaddlePaddle/models/blob/develop/fluid/image_classification/README_cn.md#已有模型及其性能) for users to download and use. We also provide a tool that converts Caffe models into PaddlePaddle Fluid model configurations and parameter files.
- [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [VGG](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [GoogleNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [MobileNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Dual Path Network](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [SE-ResNeXt](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/models)
- [Tool for converting Caffe models into Paddle Fluid configuration and model files](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
## Object detection
The goal of object detection is, given an image or a video frame, to have the computer find the locations of all objects in it and give the specific category of each one. For humans, object detection is a very simple task; for a computer, however, what it "sees" are the numbers that encode the image, so it is hard to grasp high-level semantic concepts such as a person or an object appearing in the image or frame, and even harder to locate the region where the object appears. At the same time, objects can appear anywhere in an image or frame, their shapes vary enormously, and the backgrounds differ widely, all of which make object detection a challenging problem for computers.
For the object detection task, we show how to train the SSD detection algorithm on [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [MS COCO](http://cocodataset.org/#home) data. SSD (Single Shot MultiBox Detector) is one of the newer and better-performing detection algorithms, offering both fast detection speed and high detection accuracy. We also open-source a trained [MobileNet-SSD model](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md#模型发布).
- [Single Shot MultiBox Detector](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md)
## Semantic image segmentation
Semantic image segmentation, as the name suggests, groups/segments image pixels according to their semantic meaning. Image semantics refers to the understanding of image content, for example being able to describe what object is doing what and where; segmentation means labeling every pixel in the image with the category it belongs to. In recent years it has been used in autonomous driving to segment street scenes in order to avoid pedestrians and vehicles, and in medical image analysis to assist diagnosis.
For the semantic segmentation task, we show how to perform segmentation with the Image Cascade Network (ICNet), which balances accuracy and speed better than other segmentation algorithms.
- [ICNet](https://github.com/PaddlePaddle/models/tree/develop/fluid/icnet)
## Scene text recognition
Many scene images contain rich textual information, which plays an important role in understanding the images and greatly helps people perceive and understand their content. Scene text recognition is the process of converting image information into text sequences under complex backgrounds, low resolution, diverse fonts and arbitrary layouts; it can be regarded as a special kind of translation: translating image input into natural-language output. Advances in scene text recognition have also enabled new applications, such as automatically reading the text on road signs to help street-view applications obtain more accurate address information.
For the scene text recognition task, we show how to combine CNN-based image feature extraction with RNN-based sequence translation, removing the need for hand-crafted features and character segmentation, and using automatically learned image features to perform end-to-end, unconstrained character localization and recognition. The CRNN-CTC model is covered at present; an attention-based sequence-to-sequence model will be added later.
- [CRNN-CTC model](https://github.com/PaddlePaddle/models/tree/develop/fluid/ocr_recognition)
## Speech recognition
......
......@@ -4,4 +4,6 @@ data/
label/
*.swp
*.log
infer_results/
log*
output*
infer_results*
"""
This code is based on https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py
"""
import time
import numpy as np
import threading
import multiprocessing
try:
import queue
except ImportError:
import Queue as queue
class GeneratorEnqueuer(object):
"""
Builds a queue out of a data generator.
Args:
generator: a generator function which endlessly yields data
use_multiprocessing (bool): use multiprocessing if True,
otherwise use threading.
wait_time (float): time to sleep in-between calls to `put()`.
random_seed (int): Initial seed for workers,
will be incremented by one for each worker.
"""
def __init__(self,
generator,
use_multiprocessing=False,
wait_time=0.05,
random_seed=None):
self.wait_time = wait_time
self._generator = generator
self._use_multiprocessing = use_multiprocessing
self._threads = []
self._stop_event = None
self.queue = None
self._manager = None
self.seed = random_seed
def start(self, workers=1, max_queue_size=10):
"""
Start worker threads which add data from the generator into the queue.
Args:
workers (int): number of worker threads
max_queue_size (int): queue size
(when full, threads could block on `put()`)
"""
def data_generator_task():
"""
Data generator task.
"""
def task():
if (self.queue is not None and
self.queue.qsize() < max_queue_size):
generator_output = next(self._generator)
self.queue.put((generator_output))
else:
time.sleep(self.wait_time)
if not self._use_multiprocessing:
while not self._stop_event.is_set():
with self.genlock:
try:
task()
except Exception:
self._stop_event.set()
break
else:
while not self._stop_event.is_set():
try:
task()
except Exception:
self._stop_event.set()
break
try:
if self._use_multiprocessing:
self._manager = multiprocessing.Manager()
self.queue = self._manager.Queue(maxsize=max_queue_size)
self._stop_event = multiprocessing.Event()
else:
self.genlock = threading.Lock()
self.queue = queue.Queue()
self._stop_event = threading.Event()
for _ in range(workers):
if self._use_multiprocessing:
# Reset the random seed, otherwise all child processes
# share the same seed
np.random.seed(self.seed)
thread = multiprocessing.Process(target=data_generator_task)
thread.daemon = True
if self.seed is not None:
self.seed += 1
else:
thread = threading.Thread(target=data_generator_task)
self._threads.append(thread)
thread.start()
except:
self.stop()
raise
def is_running(self):
"""
Returns:
bool: Whether the worker threads are running.
"""
return self._stop_event is not None and not self._stop_event.is_set()
def stop(self, timeout=None):
"""
Stops running threads and waits for them to exit, if necessary.
Should be called by the same thread which called `start()`.
Args:
timeout(int|None): maximum time to wait on `thread.join()`.
"""
if self.is_running():
self._stop_event.set()
for thread in self._threads:
if self._use_multiprocessing:
if thread.is_alive():
thread.terminate()
else:
thread.join(timeout)
if self._manager:
self._manager.shutdown()
self._threads = []
self._stop_event = None
self.queue = None
def get(self):
"""
Creates a generator to extract data from the queue.
Skip the data if it is `None`.
# Yields
tuple of data in the queue.
"""
while self.is_running():
if not self.queue.empty():
inputs = self.queue.get()
if inputs is not None:
yield inputs
else:
time.sleep(self.wait_time)
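As a quick orientation, here is a minimal, hypothetical usage sketch of `GeneratorEnqueuer`: a toy generator stands in for the real data pipeline, and the import mirrors the one added to reader.py below.
```
# Sketch only: wrap an endless generator, start a worker thread, and pull a
# few items from get(); names other than GeneratorEnqueuer are illustrative.
import itertools
from data_util import GeneratorEnqueuer

def toy_generator():
    for i in itertools.count():   # endlessly yields dummy "batches"
        yield i

enqueuer = GeneratorEnqueuer(toy_generator(), use_multiprocessing=False)
enqueuer.start(workers=1, max_queue_size=10)
try:
    for item in itertools.islice(enqueuer.get(), 5):
        print(item)
finally:
    enqueuer.stop()
```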
......@@ -3,6 +3,7 @@ from PIL import ImageFile
import numpy as np
import random
import math
import cv2
ImageFile.LOAD_TRUNCATED_IMAGES = True  # otherwise an IOError is raised when the image file is truncated
......@@ -107,10 +108,10 @@ def data_anchor_sampling(sampler, bbox_labels, image_width, image_height,
rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
if num_gt != 0:
norm_xmin = bbox_labels[rand_idx][0]
norm_ymin = bbox_labels[rand_idx][1]
norm_xmax = bbox_labels[rand_idx][2]
norm_ymax = bbox_labels[rand_idx][3]
norm_xmin = bbox_labels[rand_idx][1]
norm_ymin = bbox_labels[rand_idx][2]
norm_xmax = bbox_labels[rand_idx][3]
norm_ymax = bbox_labels[rand_idx][4]
xmin = norm_xmin * image_width
ymin = norm_ymin * image_height
......@@ -321,7 +322,34 @@ def transform_labels(bbox_labels, sample_bbox):
return sample_labels
def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
def transform_labels_sampling(bbox_labels, sample_bbox, resize_val,
min_face_size):
sample_labels = []
for i in range(len(bbox_labels)):
sample_label = []
object_bbox = bbox(bbox_labels[i][1], bbox_labels[i][2],
bbox_labels[i][3], bbox_labels[i][4])
if not meet_emit_constraint(object_bbox, sample_bbox):
continue
proj_bbox = project_bbox(object_bbox, sample_bbox)
if proj_bbox:
real_width = float((proj_bbox.xmax - proj_bbox.xmin) * resize_val)
real_height = float((proj_bbox.ymax - proj_bbox.ymin) * resize_val)
if real_width * real_height < float(min_face_size * min_face_size):
continue
else:
sample_label.append(bbox_labels[i][0])
sample_label.append(float(proj_bbox.xmin))
sample_label.append(float(proj_bbox.ymin))
sample_label.append(float(proj_bbox.xmax))
sample_label.append(float(proj_bbox.ymax))
sample_label = sample_label + bbox_labels[i][5:]
sample_labels.append(sample_label)
return sample_labels
def crop_image(img, bbox_labels, sample_bbox, image_width, image_height,
resize_width, resize_height, min_face_size):
sample_bbox = clip_bbox(sample_bbox)
xmin = int(sample_bbox.xmin * image_width)
xmax = int(sample_bbox.xmax * image_width)
......@@ -329,12 +357,15 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
ymax = int(sample_bbox.ymax * image_height)
sample_img = img[ymin:ymax, xmin:xmax]
sample_labels = transform_labels(bbox_labels, sample_bbox)
resize_val = resize_width
sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
resize_val, min_face_size)
return sample_img, sample_labels
def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
image_height, resize_width, resize_height):
image_height, resize_width, resize_height,
min_face_size):
# no clipping here
xmin = int(sample_bbox.xmin * image_width)
xmax = int(sample_bbox.xmax * image_width)
......@@ -358,14 +389,16 @@ def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
roi_width = cross_width
roi_height = cross_height
sample_img = np.zeros((width, height, 3))
sample_img[roi_xmin : roi_xmin + roi_width, roi_ymin : roi_ymin + roi_height] = \
img[cross_xmin : cross_xmin + cross_width, cross_ymin : cross_ymin + cross_height]
sample_img = np.zeros((height, width, 3))
sample_img[int(roi_ymin) : int(roi_ymin + roi_height), int(roi_xmin) : int(roi_xmin + roi_width)] = \
img[int(cross_ymin) : int(cross_ymin + cross_height), int(cross_xmin) : int(cross_xmin + cross_width)]
sample_img = cv2.resize(
sample_img, (resize_width, resize_height), interpolation=cv2.INTER_AREA)
sample_labels = transform_labels(bbox_labels, sample_bbox)
resize_val = resize_width
sample_labels = transform_labels_sampling(bbox_labels, sample_bbox,
resize_val, min_face_size)
return sample_img, sample_labels
......
......@@ -15,7 +15,7 @@ parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, False, "Whether use PyramidBox model.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('confs_threshold', float, 0.25, "Confidence threshold to draw bbox.")
add_arg('image_path', str, '', "The data root path.")
add_arg('model_dir', str, '', "The model path.")
......@@ -168,6 +168,9 @@ def detect_face(image, shrink):
return_numpy=False)
detection = np.array(detection)
# layout: xmin, ymin, xmax, ymax, score
if detection.shape == (1, ):
print("No face detected")
return np.array([[0, 0, 0, 0, 0]])
det_conf = detection[:, 1]
det_xmin = image_shape[2] * detection[:, 2] / shrink
det_ymin = image_shape[1] * detection[:, 3] / shrink
......@@ -227,6 +230,33 @@ def multi_scale_test(image, max_shrink):
return det_s, det_b
def multi_scale_test_pyramid(image, max_shrink):
# detect at a shrunken scale (0.25) and keep only large faces
det_b = detect_face(image, 0.25)
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
> 30)[0]
det_b = det_b[index, :]
st = [0.5, 0.75, 1.25, 1.5, 1.75, 2.25]
for i in range(len(st)):
if (st[i] <= max_shrink):
det_temp = detect_face(image, st[i])
# at enlarged scales, keep only small faces
if st[i] > 1:
index = np.where(
np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
det_temp = det_temp[index, :]
else:
index = np.where(
np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0]
det_temp = det_temp[index, :]
det_b = np.row_stack((det_b, det_temp))
return det_b
def get_im_shrink(image_shape):
max_shrink_v1 = (0x7fffffff / 577.0 /
(image_shape[1] * image_shape[2]))**0.5
......@@ -272,7 +302,8 @@ def infer(args, batch_size, data_args):
det0 = detect_face(image, shrink)
det1 = flip_test(image, shrink)
[det2, det3] = multi_scale_test(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3))
det4 = multi_scale_test_pyramid(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3, det4))
dets = bbox_vote(det)
image_name = image_path.split('/')[-1]
......
import os
import shutil
import numpy as np
import time
import argparse
import functools
import reader
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from pyramidbox import PyramidBox
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 20, "Minibatch size.")
add_arg('num_iteration', int, 10, "Number of iterations to run.")
add_arg('skip_reader', bool, False, "Whether to skip data reader.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
#yapf: enable
def train(args, config, train_file_list, optimizer_method):
learning_rate = args.learning_rate
batch_size = args.batch_size
height = args.resize_h
width = args.resize_w
use_gpu = args.use_gpu
use_pyramidbox = args.use_pyramidbox
model_save_dir = args.model_save_dir
pretrained_model = args.pretrained_model
skip_reader = args.skip_reader
num_iterations = args.num_iteration
parallel = args.parallel
num_classes = 2
image_shape = [3, height, width]
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
fetches = []
network = PyramidBox(image_shape, num_classes,
sub_network=use_pyramidbox)
if use_pyramidbox:
face_loss, head_loss, loss = network.train()
fetches = [face_loss, head_loss]
else:
loss = network.vgg_ssd_loss()
fetches = [loss]
epocs = 12880 / batch_size
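# 12880 is the number of images in the WIDER FACE training split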
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
]
if optimizer_method == "momentum":
optimizer = fluid.optimizer.Momentum(
learning_rate=fluid.layers.piecewise_decay(
boundaries=boundaries, values=values),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.0005),
)
else:
optimizer = fluid.optimizer.RMSProp(
learning_rate=fluid.layers.piecewise_decay(boundaries, values),
regularization=fluid.regularizer.L2Decay(0.0005),
)
optimizer.minimize(loss)
fluid.memory_optimize(fluid.default_main_program())
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
start_pass = 0
if pretrained_model:
if pretrained_model.isdigit():
start_pass = int(pretrained_model) + 1
pretrained_model = os.path.join(model_save_dir, pretrained_model)
print("Resume from %s " %(pretrained_model))
if not os.path.exists(pretrained_model):
raise ValueError("The pre-trained model path [%s] does not exist." %
(pretrained_model))
def if_exist(var):
return os.path.exists(os.path.join(pretrained_model, var.name))
fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
if parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu, loss_name=loss.name)
train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def tensor(data, place, lod=None):
t = fluid.core.LoDTensor()
t.set(data, place)
if lod:
t.set_lod(lod)
return t
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
def run(iterations, feed_data):
# global feed_data
reader_time = []
run_time = []
for batch_id in range(iterations):
start_time = time.time()
if not skip_reader:
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feed_data = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
end_time = time.time()
reader_time.append(end_time - start_time)
start_time = time.time()
if parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
feed=feed_data)
else:
fetch_vars = exe.run(fluid.default_main_program(),
feed=feed_data,
fetch_list=fetches)
end_time = time.time()
run_time.append(end_time - start_time)
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
if not args.use_pyramidbox:
print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
else:
print("Batch {0}, face loss {1}, head loss {2}".format(
batch_id, fetch_vars[0], fetch_vars[1]))
return reader_time, run_time
# start-up
run(2, feed_data)
# profiling
start = time.time()
if not parallel:
with profiler.profiler('All', 'total', '/tmp/profile_file'):
reader_time, run_time = run(num_iterations, feed_data)
else:
reader_time, run_time = run(num_iterations, feed_data)
end = time.time()
total_time = end - start
print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
total_time, np.sum(reader_time), np.sum(run_time)))
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
data_dir = 'data/WIDERFACE/WIDER_train/images/'
train_file_list = 'label/train_gt_widerface.res'
config = reader.Settings(
data_dir=data_dir,
resize_h=args.resize_h,
resize_w=args.resize_w,
apply_expand=False,
mean_value=[104., 117., 123.],
ap_version='11point')
train(args, config, train_file_list, optimizer_method="momentum")
......@@ -81,10 +81,7 @@ class PyramidBox(object):
if self.is_infer:
return [self.image]
else:
return [
self.image, self.face_box, self.head_box, self.gt_label,
self.difficult
]
return [self.image, self.face_box, self.head_box, self.gt_label]
def _input(self):
self.image = fluid.layers.data(
......@@ -96,8 +93,6 @@ class PyramidBox(object):
name='head_box', shape=[4], dtype='float32', lod_level=1)
self.gt_label = fluid.layers.data(
name='gt_label', shape=[1], dtype='int32', lod_level=1)
self.difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
def _vgg(self):
self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2)
......@@ -144,7 +139,8 @@ class PyramidBox(object):
stride=2,
groups=ch,
param_attr=w_attr,
bias_attr=False)
bias_attr=False,
use_cudnn=True)
else:
upsampling = fluid.layers.resize_bilinear(
conv1, out_shape=up_to.shape[2:])
......@@ -385,6 +381,7 @@ class PyramidBox(object):
self.box_vars,
overlap_threshold=0.35,
neg_overlap=0.35)
face_loss.persistable = True
head_loss = fluid.layers.ssd_loss(
self.head_mbox_loc,
self.head_mbox_conf,
......@@ -394,9 +391,13 @@ class PyramidBox(object):
self.box_vars,
overlap_threshold=0.35,
neg_overlap=0.35)
head_loss.persistable = True
face_loss = fluid.layers.reduce_sum(face_loss)
face_loss.persistable = True
head_loss = fluid.layers.reduce_sum(head_loss)
head_loss.persistable = True
total_loss = face_loss + head_loss
total_loss.persistable = True
return face_loss, head_loss, total_loss
def infer(self, main_program=None):
......@@ -410,5 +411,8 @@ class PyramidBox(object):
self.face_mbox_conf,
self.prior_boxes,
self.box_vars,
nms_threshold=0.45)
nms_threshold=0.3,
nms_top_k=5000,
keep_top_k=750,
score_threshold=0.05)
return test_program, face_nmsed_out
......@@ -23,6 +23,8 @@ import os
import time
import copy
import random
import cv2
from data_util import GeneratorEnqueuer
class Settings(object):
......@@ -61,9 +63,29 @@ class Settings(object):
self.brightness_delta = 0.125
self.scale = 0.007843 # 1 / 127.5
self.data_anchor_sampling_prob = 0.5
self.min_face_size = 8.0
def preprocess(img, bbox_labels, mode, settings):
def draw_image(faces_pred, img, resize_val):
for i in range(len(faces_pred)):
draw_rotate_rectange(img, faces_pred[i], resize_val, (0, 255, 0), 3)
def draw_rotate_rectange(img, face, resize_val, color, thickness):
cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
face[3] * resize_val), int(face[2] * resize_val)), color, thickness)
cv2.line(img, (int(face[3] * resize_val), int(face[2] * resize_val)), (int(
face[3] * resize_val), int(face[4] * resize_val)), color, thickness)
cv2.line(img, (int(face[1] * resize_val), int(face[2] * resize_val)), (int(
face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
cv2.line(img, (int(face[3] * resize_val), int(face[4] * resize_val)), (int(
face[1] * resize_val), int(face[4] * resize_val)), color, thickness)
def preprocess(img, bbox_labels, mode, settings, image_path):
img_width, img_height = img.size
sampled_labels = bbox_labels
if mode == 'train':
......@@ -86,13 +108,28 @@ def preprocess(img, bbox_labels, mode, settings):
batch_sampler, bbox_labels, img_width, img_height, scale_array,
settings.resize_width, settings.resize_height)
img = np.array(img)
# Debug
# img_save = Image.fromarray(img)
# img_save.save('img_orig.jpg')
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image_sampling(
img, bbox_labels, sampled_bbox[idx], img_width, img_height,
resize_width, resize_heigh)
settings.resize_width, settings.resize_height,
settings.min_face_size)
img = img.astype('uint8')
# Debug: visualize the gt bbox
visualize_bbox = 0
if visualize_bbox:
img_show = img
draw_image(sampled_labels, img_show, settings.resize_height)
img_show = Image.fromarray(img_show)
img_show.save('final_img_show.jpg')
img = Image.fromarray(img)
# Debug
# img.save('final_img.jpg')
else:
# hard-code here
......@@ -118,7 +155,9 @@ def preprocess(img, bbox_labels, mode, settings):
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image(
img, bbox_labels, sampled_bbox[idx], img_width, img_height)
img, bbox_labels, sampled_bbox[idx], img_width, img_height,
settings.resize_width, settings.resize_height,
settings.min_face_size)
img = Image.fromarray(img)
......@@ -146,20 +185,20 @@ def preprocess(img, bbox_labels, mode, settings):
return img, sampled_labels
def put_txt_in_dict(input_txt):
def load_file_list(input_txt):
with open(input_txt, 'r') as f_dir:
lines_input_txt = f_dir.readlines()
dict_input_txt = {}
file_dict = {}
num_class = 0
for i in range(len(lines_input_txt)):
tmp_line_txt = lines_input_txt[i].strip('\n\t\r')
if '--' in tmp_line_txt:
if i != 0:
num_class += 1
dict_input_txt[num_class] = []
file_dict[num_class] = []
dict_name = tmp_line_txt
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
if '--' not in tmp_line_txt:
if len(tmp_line_txt) > 6:
split_str = tmp_line_txt.split(' ')
......@@ -169,11 +208,11 @@ def put_txt_in_dict(input_txt):
y2_max = float(split_str[3])
tmp_line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
x2_max) + ' ' + str(y2_max)
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
else:
dict_input_txt[num_class].append(tmp_line_txt)
file_dict[num_class].append(tmp_line_txt)
return dict_input_txt
return file_dict
def expand_bboxes(bboxes,
......@@ -200,67 +239,106 @@ def expand_bboxes(bboxes,
return expand_boxes
def pyramidbox(settings, file_list, mode, shuffle):
dict_input_txt = {}
dict_input_txt = put_txt_in_dict(file_list)
def reader():
if mode == 'train' and shuffle:
random.shuffle(dict_input_txt)
for index_image in range(len(dict_input_txt)):
def train_generator(settings, file_list, batch_size, shuffle=True):
file_dict = load_file_list(file_list)
while True:
image_ids = list(file_dict.keys())
if shuffle:
random.shuffle(image_ids)
images, face_boxes, head_boxes, label_ids = [], [], [], []
label_offs = [0]
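# label_offs holds cumulative per-image box counts ([0, n_0, n_0 + n_1, ...]) and is used as the LoD offsets of the batch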
image_name = dict_input_txt[index_image][0] + '.jpg'
for index_image in image_ids:
image_name = file_dict[index_image][0] + '.jpg'
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
im_width, im_height = im.size
# layout: label | xmin | ymin | xmax | ymax
if mode == 'train':
bbox_labels = []
for index_box in range(len(dict_input_txt[index_image])):
if index_box >= 2:
bbox_sample = []
temp_info_box = dict_input_txt[index_image][
index_box].split(' ')
xmin = float(temp_info_box[0])
ymin = float(temp_info_box[1])
w = float(temp_info_box[2])
h = float(temp_info_box[3])
xmax = xmin + w
ymax = ymin + h
bbox_sample.append(1)
bbox_sample.append(float(xmin) / im_width)
bbox_sample.append(float(ymin) / im_height)
bbox_sample.append(float(xmax) / im_width)
bbox_sample.append(float(ymax) / im_height)
bbox_labels.append(bbox_sample)
im, sample_labels = preprocess(im, bbox_labels, mode, settings)
sample_labels = np.array(sample_labels)
if len(sample_labels) == 0: continue
im = im.astype('float32')
boxes = sample_labels[:, 1:5]
lbls = [1] * len(boxes)
difficults = [1] * len(boxes)
yield im, boxes, expand_bboxes(boxes), lbls, difficults
if mode == 'test':
yield im, image_path
return reader
bbox_labels = []
for index_box in range(len(file_dict[index_image])):
if index_box >= 2:
bbox_sample = []
temp_info_box = file_dict[index_image][index_box].split(' ')
xmin = float(temp_info_box[0])
ymin = float(temp_info_box[1])
w = float(temp_info_box[2])
h = float(temp_info_box[3])
xmax = xmin + w
ymax = ymin + h
bbox_sample.append(1)
bbox_sample.append(float(xmin) / im_width)
bbox_sample.append(float(ymin) / im_height)
bbox_sample.append(float(xmax) / im_width)
bbox_sample.append(float(ymax) / im_height)
bbox_labels.append(bbox_sample)
im, sample_labels = preprocess(im, bbox_labels, "train", settings,
image_path)
sample_labels = np.array(sample_labels)
if len(sample_labels) == 0: continue
im = im.astype('float32')
face_box = sample_labels[:, 1:5]
head_box = expand_bboxes(face_box)
label = [1] * len(face_box)
images.append(im)
face_boxes.extend(face_box)
head_boxes.extend(head_box)
label_ids.extend(label)
label_offs.append(label_offs[-1] + len(face_box))
if len(images) == batch_size:
images = np.array(images).astype('float32')
face_boxes = np.array(face_boxes).astype('float32')
head_boxes = np.array(head_boxes).astype('float32')
label_ids = np.array(label_ids).astype('int32')
yield images, face_boxes, head_boxes, label_ids, label_offs
images, face_boxes, head_boxes = [], [], []
label_ids, label_offs = [], [0]
def train_batch_reader(settings,
file_list,
batch_size,
shuffle=True,
num_workers=8):
try:
enqueuer = GeneratorEnqueuer(
train_generator(settings, file_list, batch_size, shuffle),
use_multiprocessing=False)
enqueuer.start(max_queue_size=24, workers=num_workers)
generator_output = None
while True:
while enqueuer.is_running():
if not enqueuer.queue.empty():
generator_output = enqueuer.queue.get()
break
else:
time.sleep(0.01)
yield generator_output
generator_output = None
finally:
if enqueuer is not None:
enqueuer.stop()
def train(settings, file_list, shuffle=True):
return pyramidbox(settings, file_list, 'train', shuffle)
def test(settings, file_list):
file_dict = load_file_list(file_list)
def reader():
for index_image in file_dict.keys():
image_name = file_dict[index_image][0] + '.jpg'
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
im = im.convert('RGB')
yield im, image_path
def test(settings, file_list):
return pyramidbox(settings, file_list, 'test', False)
return reader
def infer(settings, image_path):
......
......@@ -18,13 +18,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 12, "Minibatch size.")
add_arg('num_passes', int, 120, "Epoch number.")
add_arg('num_passes', int, 160, "Epoch number.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('pretrained_model', str, './pretrained/', "The init model path.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image width.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
#yapf: enable
......@@ -38,6 +39,7 @@ def train(args, config, train_file_list, optimizer_method):
use_pyramidbox = args.use_pyramidbox
model_save_dir = args.model_save_dir
pretrained_model = args.pretrained_model
with_memory_optimization = args.with_mem_opt
num_classes = 2
image_shape = [3, height, width]
......@@ -56,8 +58,9 @@ def train(args, config, train_file_list, optimizer_method):
loss = network.vgg_ssd_loss()
fetches = [loss]
epocs = 12880 / batch_size
boundaries = [epocs * 40, epocs * 60, epocs * 80, epocs * 100]
steps_per_pass = 12880 / batch_size
boundaries = [steps_per_pass * 50, steps_per_pass * 80,
steps_per_pass * 120, steps_per_pass * 140]
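# e.g. with the default batch_size of 12, steps_per_pass = 12880 / 12 = 1073, so the learning rate decays at passes 50, 80, 120 and 140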
values = [
learning_rate, learning_rate * 0.5, learning_rate * 0.25,
learning_rate * 0.1, learning_rate * 0.01
......@@ -77,7 +80,8 @@ def train(args, config, train_file_list, optimizer_method):
)
optimizer.minimize(loss)
#fluid.memory_optimize(fluid.default_main_program())
if with_memory_optimization:
fluid.memory_optimize(fluid.default_main_program())
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -101,9 +105,7 @@ def train(args, config, train_file_list, optimizer_method):
train_exe = fluid.ParallelExecutor(
use_cuda=use_gpu, loss_name=loss.name)
train_reader = paddle.batch(
reader.train(config, train_file_list), batch_size=batch_size)
feeder = fluid.DataFeeder(place=place, feed_list=network.feeds())
train_reader = reader.train_batch_reader(config, train_file_list, batch_size=batch_size)
def save_model(postfix):
model_path = os.path.join(model_save_dir, postfix)
......@@ -112,20 +114,34 @@ def train(args, config, train_file_list, optimizer_method):
print 'save models to %s' % (model_path)
fluid.io.save_persistables(exe, model_path)
def tensor(data, place, lod=None):
t = fluid.core.LoDTensor()
t.set(data, place)
if lod:
t.set_lod(lod)
return t
for pass_id in range(start_pass, num_passes):
start_time = time.time()
prev_start_time = start_time
end_time = 0
for batch_id, data in enumerate(train_reader()):
for batch_id in range(steps_per_pass):
im, face_box, head_box, labels, lod = next(train_reader)
im_t = tensor(im, place)
box1 = tensor(face_box, place, [lod])
box2 = tensor(head_box, place, [lod])
lbl_t = tensor(labels, place, [lod])
feeding = {'image': im_t, 'face_box': box1,
'head_box': box2, 'gt_label': lbl_t}
prev_start_time = start_time
start_time = time.time()
if len(data) < 2 * devices_num: continue
if args.parallel:
fetch_vars = train_exe.run(fetch_list=[v.name for v in fetches],
feed=feeder.feed(data))
feed=feeding)
else:
fetch_vars = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
feed=feeding,
fetch_list=fetches)
end_time = time.time()
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
......@@ -155,7 +171,8 @@ if __name__ == '__main__':
data_dir=data_dir,
resize_h=args.resize_h,
resize_w=args.resize_w,
apply_distort=True,
apply_expand=False,
mean_value=[104., 117., 123],
mean_value=[104., 117., 123.],
ap_version='11point')
train(args, config, train_file_list, optimizer_method="momentum")
......@@ -38,7 +38,7 @@ class InferTaskConfig(object):
batch_size = 10
# the parameters for beam search.
beam_size = 5
max_length = 256
max_out_len = 256
# the number of decoded sentences to output.
n_best = 1
# the flags indicating whether to output the special tokens.
......@@ -104,23 +104,28 @@ def merge_cfg_from_list(cfg_list, g_cfgs):
break
# The placeholder for batch_size at compile time. It must be -1 currently to
# be consistent with some ops' infer-shape output at compile time, such as the
# sequence_expand op used in the beam-search decoder.
batch_size = -1
# The placeholder for sequence length at compile time.
seq_len = ModelHyperParams.max_length
# Here we list the data shapes and data types of all inputs.
# The shapes here act as placeholders and are set only to pass shape inference
# at compile time.
input_descs = {
# The actual data shape of src_word is:
# [batch_size * max_src_len_in_batch, 1]
"src_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"src_word": [(batch_size * seq_len, 1L), "int64", 2],
# The actual data shape of src_pos is:
# [batch_size * max_src_len_in_batch, 1]
"src_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"src_pos": [(batch_size * seq_len, 1L), "int64"],
# This input is used to remove attention weights on paddings in the
# encoder.
# The actual data shape of src_slf_attn_bias is:
# [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
"src_slf_attn_bias":
[(1, ModelHyperParams.n_head, (ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This shape input is used to reshape the output of embedding layer.
"src_data_shape": [(3L, ), "int32"],
# This shape input is used to reshape before softmax in self attention.
......@@ -129,24 +134,23 @@ input_descs = {
"src_slf_attn_post_softmax_shape": [(4L, ), "int32"],
# The actual data shape of trg_word is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"trg_word": [(batch_size * seq_len, 1L), "int64",
2], # lod_level is only used in fast decoder.
# The actual data shape of trg_pos is:
# [batch_size * max_trg_len_in_batch, 1]
"trg_pos": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"trg_pos": [(batch_size * seq_len, 1L), "int64"],
# This input is used to remove attention weights on paddings and
# subsequent words in the decoder.
# The actual data shape of trg_slf_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
"trg_slf_attn_bias": [(1, ModelHyperParams.n_head,
(ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This input is used to remove attention weights on paddings of the source
# input in the encoder-decoder attention.
# The actual data shape of trg_src_attn_bias is:
# [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
"trg_src_attn_bias": [(1, ModelHyperParams.n_head,
(ModelHyperParams.max_length + 1),
(ModelHyperParams.max_length + 1)), "float32"],
"trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
seq_len), "float32"],
# This shape input is used to reshape the output of embedding layer.
"trg_data_shape": [(3L, ), "int32"],
# This shape input is used to reshape before softmax in self attention.
......@@ -162,15 +166,18 @@ input_descs = {
# This input is used in independent decoder program for inference.
# The actual data shape of enc_output is:
# [batch_size, max_src_len_in_batch, d_model]
"enc_output": [(1, (ModelHyperParams.max_length + 1),
ModelHyperParams.d_model), "float32"],
"enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
# The actual data shape of label_word is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_word": [(1 * (ModelHyperParams.max_length + 1), 1L), "int64"],
"lbl_word": [(batch_size * seq_len, 1L), "int64"],
# This input is used to mask out the loss of paddding tokens.
# The actual data shape of label_weight is:
# [batch_size * max_trg_len_in_batch, 1]
"lbl_weight": [(1 * (ModelHyperParams.max_length + 1), 1L), "float32"],
"lbl_weight": [(batch_size * seq_len, 1L), "float32"],
# These inputs are used to change the shape tensor in beam-search decoder.
"trg_slf_attn_pre_softmax_shape_delta": [(2L, ), "int32"],
"trg_slf_attn_post_softmax_shape_delta": [(4L, ), "int32"],
"init_score": [(batch_size, 1L), "float32"],
}
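For reference, a sketch of how one of these descriptors is consumed; this mirrors the `make_all_inputs` change in model.py further below, and the standalone import is only for illustration:
```
# Sketch only: turning a descriptor from input_descs into a Fluid data layer,
# as make_all_inputs does for every field.
import paddle.fluid as fluid

desc = input_descs["src_word"]      # [(batch_size * seq_len, 1L), "int64", 2]
src_word = fluid.layers.data(
    name="src_word",
    shape=desc[0],                  # placeholder shape, only for compile-time infer-shape
    dtype=desc[1],
    lod_level=desc[2] if len(desc) == 3 else 0,
    append_batch_size=False)
```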
# Names of word embedding table which might be reused for weight sharing.
......@@ -205,3 +212,12 @@ decoder_util_input_fields = (
label_data_input_fields = (
"lbl_word",
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
"init_score",
"trg_src_attn_bias", )
fast_decoder_util_input_fields = decoder_util_input_fields + (
"trg_slf_attn_pre_softmax_shape_delta",
"trg_slf_attn_post_softmax_shape_delta", )
......@@ -7,6 +7,7 @@ import paddle.fluid as fluid
import model
from model import wrap_encoder as encoder
from model import wrap_decoder as decoder
from model import fast_decode as fast_decoder
from config import *
from train import pad_batch_data
import reader
......@@ -87,7 +88,8 @@ def translate_batch(exe,
output_unk=True):
"""
Run the encoder program once and run the decoder program multiple times to
implement beam search externally.
implement beam search externally. This is deprecated since a faster beam
search decoder based solely on Fluid operators has been added.
"""
# Prepare data for encoder and run the encoder.
enc_in_data = pad_batch_data(
......@@ -297,7 +299,32 @@ def translate_batch(exe,
return seqs, scores[:, :n_best].tolist()
def infer(args):
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
def py_infer(test_data, trg_idx2word):
"""
Inference by beam search implemented in Python, while the calculations from
symbols to probabilities are executed by Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -341,49 +368,8 @@ def infer(args):
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to set dropout to the test mode.
encoder_program = fluid.io.get_inference_program(
target_vars=[enc_output], main_program=encoder_program)
decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
def post_process_seq(seq,
bos_idx=ModelHyperParams.bos_idx,
eos_idx=ModelHyperParams.eos_idx,
output_bos=InferTaskConfig.output_bos,
output_eos=InferTaskConfig.output_eos):
"""
Post-process the beam-search decoded sequence. Truncate from the first
<eos> and remove the <bos> and <eos> tokens currently.
"""
eos_pos = len(seq) - 1
for i, idx in enumerate(seq):
if idx == eos_idx:
eos_pos = i
break
seq = seq[:eos_pos + 1]
return filter(
lambda idx: (output_bos or idx != bos_idx) and \
(output_eos or idx != eos_idx),
seq)
encoder_program = encoder_program.inference_optimize()
decoder_program = decoder_program.inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
batch_seqs, batch_scores = translate_batch(
......@@ -397,7 +383,7 @@ def infer(args):
(decoder_data_input_fields[-1], ),
[predict.name],
InferTaskConfig.beam_size,
InferTaskConfig.max_length,
InferTaskConfig.max_out_len,
InferTaskConfig.n_best,
len(data),
ModelHyperParams.n_head,
......@@ -416,6 +402,154 @@ def infer(args):
print(" ".join([trg_idx2word[idx] for idx in seq]))
def prepare_batch_input(insts, data_input_names, util_input_names, src_pad_idx,
bos_idx, n_head, d_model, place):
"""
Put all padded data needed by beam search decoder into a dict.
"""
src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
[inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
# start tokens
trg_word = np.asarray([[bos_idx]] * len(insts), dtype="int64")
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, 1, 1]).astype("float32")
# These shape tensors are used in reshape_op.
src_data_shape = np.array([-1, src_max_len, d_model], dtype="int32")
trg_data_shape = np.array([-1, 1, d_model], dtype="int32")
src_slf_attn_pre_softmax_shape = np.array(
[-1, src_slf_attn_bias.shape[-1]], dtype="int32")
src_slf_attn_post_softmax_shape = np.array(
[-1] + list(src_slf_attn_bias.shape[1:]), dtype="int32")
trg_slf_attn_pre_softmax_shape = np.array(
[-1, 1], dtype="int32") # only the first time step
trg_slf_attn_post_softmax_shape = np.array(
[-1, n_head, 1, 1], dtype="int32") # only the first time step
trg_src_attn_pre_softmax_shape = np.array(
[-1, trg_src_attn_bias.shape[-1]], dtype="int32")
trg_src_attn_post_softmax_shape = np.array(
[-1] + list(trg_src_attn_bias.shape[1:]), dtype="int32")
# These inputs are used to change the shapes in the loop of while op.
attn_pre_softmax_shape_delta = np.array([0, 1], dtype="int32")
attn_post_softmax_shape_delta = np.array([0, 0, 0, 1], dtype="int32")
def to_lodtensor(data, place, lod=None):
data_tensor = fluid.LoDTensor()
data_tensor.set(data, place)
if lod is not None:
data_tensor.set_lod(lod)
return data_tensor
# beamsearch_op must use tensors with lod
init_score = to_lodtensor(
np.zeros_like(
trg_word, dtype="float32"),
place, [range(trg_word.shape[0] + 1)] * 2)
trg_word = to_lodtensor(trg_word, place, [range(trg_word.shape[0] + 1)] * 2)
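# LoD [[0, 1, ..., N], [0, 1, ..., N]]: each source sentence starts with a single <bos> candidate, as the beam_search op expects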
data_input_dict = dict(
zip(data_input_names, [
src_word, src_pos, src_slf_attn_bias, trg_word, init_score,
trg_src_attn_bias
]))
util_input_dict = dict(
zip(util_input_names, [
src_data_shape, src_slf_attn_pre_softmax_shape,
src_slf_attn_post_softmax_shape, trg_data_shape,
trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta
]))
input_dict = dict(data_input_dict.items() + util_input_dict.items())
return input_dict
def fast_infer(test_data, trg_idx2word):
"""
Inference by beam search decoder based solely on Fluid operators.
"""
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
out_ids, out_scores = fast_decoder(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, InferTaskConfig.beam_size,
InferTaskConfig.max_out_len, ModelHyperParams.eos_idx)
fluid.io.load_vars(
exe,
InferTaskConfig.model_path,
vars=filter(lambda var: isinstance(var, fluid.framework.Parameter),
fluid.default_main_program().list_vars()))
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
for batch_id, data in enumerate(test_data.batch_generator()):
data_input = prepare_batch_input(
data, encoder_data_input_fields + fast_decoder_data_input_fields,
encoder_util_input_fields + fast_decoder_util_input_fields,
ModelHyperParams.eos_idx, ModelHyperParams.bos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model, place)
seq_ids, seq_scores = exe.run(infer_program,
feed=data_input,
fetch_list=[out_ids, out_scores],
return_numpy=False)
# How to parse the results:
# Suppose the lod of seq_ids is:
# [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
# then from lod[0]:
# there are 2 source sentences, beam width is 3.
# from lod[1]:
# the first source sentence has 3 hyps; the lengths are 12, 12, 16
# the second source sentence has 3 hyps; the lengths are 14, 13, 15
hyps = [[] for i in range(len(data))]
scores = [[] for i in range(len(data))]
for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence
start = seq_ids.lod()[0][i]
end = seq_ids.lod()[0][i + 1]
for j in range(end - start): # for each candidate
sub_start = seq_ids.lod()[1][start + j]
sub_end = seq_ids.lod()[1][start + j + 1]
hyps[i].append(" ".join([
trg_idx2word[idx]
for idx in post_process_seq(
np.array(seq_ids)[sub_start:sub_end])
]))
scores[i].append(np.array(seq_scores)[sub_end - 1])
print hyps[i][-1]
if len(hyps[i]) >= InferTaskConfig.n_best:
break
def infer(args, inferencer=fast_infer):
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
test_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.test_file_pattern,
batch_size=args.batch_size,
use_token_batch=False,
pool_size=args.pool_size,
sort_type=reader.SortType.NONE,
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
max_length=ModelHyperParams.max_length,
clip_last_batch=False)
trg_idx2word = test_data.load_dict(
dict_path=args.trg_vocab_fpath, reverse=True)
inferencer(test_data, trg_idx2word)
if __name__ == "__main__":
args = parse_args()
infer(args)
......@@ -30,7 +30,8 @@ def multi_head_attention(queries,
n_head=1,
dropout_rate=0.,
pre_softmax_shape=None,
post_softmax_shape=None):
post_softmax_shape=None,
cache=None):
"""
Multi-Head Attention. Note that attn_bias is added to the logit before
    computing softmax activation to mask certain selected positions so that
......@@ -116,6 +117,10 @@ def multi_head_attention(queries,
q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
if cache is not None: # use cache and concat time steps
k = cache["k"] = layers.concat([cache["k"], k], axis=1)
v = cache["v"] = layers.concat([cache["v"], v], axis=1)
q = __split_heads(q, n_head)
k = __split_heads(k, n_head)
v = __split_heads(v, n_head)
......@@ -203,7 +208,7 @@ def prepare_encoder(src_word,
enc_input = src_word_emb + src_pos_enc
enc_input = layers.reshape(
x=enc_input,
shape=[-1, src_max_len, src_emb_dim],
shape=[batch_size, seq_len, src_emb_dim],
actual_shape=src_data_shape)
return layers.dropout(
enc_input, dropout_prob=dropout_rate,
......@@ -285,7 +290,8 @@ def decoder_layer(dec_input,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
src_attn_post_softmax_shape=None,
cache=None):
""" The layer to be stacked in decoder part.
The structure of this module is similar to that in the encoder part except
a multi-head attention is added to implement encoder-decoder attention.
......@@ -301,7 +307,8 @@ def decoder_layer(dec_input,
n_head,
dropout_rate,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape, )
slf_attn_post_softmax_shape,
cache, )
slf_attn_output = post_process_layer(
dec_input,
slf_attn_output,
......@@ -350,7 +357,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape=None,
slf_attn_post_softmax_shape=None,
src_attn_pre_softmax_shape=None,
src_attn_post_softmax_shape=None):
src_attn_post_softmax_shape=None,
caches=None):
"""
The decoder is composed of a stack of identical decoder_layer layers.
"""
......@@ -369,7 +377,8 @@ def decoder(dec_input,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
src_attn_post_softmax_shape,
None if caches is None else caches[i], )
dec_input = dec_output
return dec_output
......@@ -384,6 +393,8 @@ def make_all_inputs(input_fields):
name=input_field,
shape=input_descs[input_field][0],
dtype=input_descs[input_field][1],
lod_level=input_descs[input_field][2]
if len(input_descs[input_field]) == 3 else 0,
append_batch_size=False)
inputs.append(input_var)
return inputs
......@@ -517,7 +528,8 @@ def wrap_decoder(trg_vocab_size,
dropout_rate,
weight_sharing,
dec_inputs=None,
enc_output=None):
enc_output=None,
caches=None):
"""
    The wrapper assembles all the layers needed for the decoder.
"""
......@@ -559,7 +571,8 @@ def wrap_decoder(trg_vocab_size,
slf_attn_pre_softmax_shape,
slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape,
src_attn_post_softmax_shape, )
src_attn_post_softmax_shape,
caches, )
# Return logits for training and probs for inference.
if weight_sharing:
predict = layers.reshape(
......@@ -578,3 +591,145 @@ def wrap_decoder(trg_vocab_size,
shape=[-1, trg_vocab_size],
act="softmax" if dec_inputs is None else None)
return predict
def fast_decode(
src_vocab_size,
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
beam_size,
max_out_len,
eos_idx, ):
"""
    Decode by beam search. Caches are used to store the states of history
    steps, which makes decoding faster.
"""
enc_output = wrap_encoder(src_vocab_size, max_in_len, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
dropout_rate, weight_sharing)
start_tokens, init_scores, trg_src_attn_bias, trg_data_shape, \
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \
src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \
attn_pre_softmax_shape_delta, attn_post_softmax_shape_delta = \
make_all_inputs(fast_decoder_data_input_fields +
fast_decoder_util_input_fields)
def beam_search():
max_len = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=max_out_len)
step_idx = layers.fill_constant(
shape=[1], dtype=start_tokens.dtype, value=0)
cond = layers.less_than(x=step_idx, y=max_len)
while_op = layers.While(cond)
    # the ids and scores of every step are stored in arrays.
ids = layers.array_write(start_tokens, step_idx)
scores = layers.array_write(init_scores, step_idx)
    # cell states will be overwritten at each step.
    # caches holds the states of history steps to avoid redundant
    # computation in the decoder.
caches = [{
"k": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0),
"v": layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, 0, d_model],
dtype=enc_output.dtype,
value=0)
} for i in range(n_layer)]
with while_op.block():
pre_ids = layers.array_read(array=ids, i=step_idx)
pre_scores = layers.array_read(array=scores, i=step_idx)
        # sequence_expand gathers sequences according to lod, so it can be used
        # in beam search to select the states corresponding to the selected ids.
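        # (Hypothetical illustration: if the lod of pre_scores says the i-th
        # source sentence kept n_i live candidates, the i-th row of x is
        # replicated n_i times, so every surviving branch gets its own copy of
        # the encoder output, attention bias and caches.)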
pre_src_attn_bias = layers.sequence_expand(
x=trg_src_attn_bias, y=pre_scores)
pre_enc_output = layers.sequence_expand(x=enc_output, y=pre_scores)
pre_caches = [{
"k": layers.sequence_expand(
x=cache["k"], y=pre_scores),
"v": layers.sequence_expand(
x=cache["v"], y=pre_scores),
} for cache in caches]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
                input=pre_enc_output,  # can't use pre_ids here since it has lod
value=1,
shape=[-1, 1],
dtype=pre_ids.dtype),
y=layers.increment(
x=step_idx, value=1.0, in_place=False),
axis=0)
logits = wrap_decoder(
trg_vocab_size,
max_in_len,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
weight_sharing,
dec_inputs=(
pre_ids, pre_pos, None, pre_src_attn_bias, trg_data_shape,
slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape,
src_attn_pre_softmax_shape, src_attn_post_softmax_shape),
enc_output=pre_enc_output,
caches=pre_caches)
topk_scores, topk_indices = layers.topk(
input=layers.softmax(logits), k=beam_size)
accu_scores = layers.elementwise_add(
x=layers.log(topk_scores),
y=layers.reshape(
pre_scores, shape=[-1]),
axis=0)
# beam_search op uses lod to distinguish branches.
topk_indices = layers.lod_reset(topk_indices, pre_ids)
selected_ids, selected_scores = layers.beam_search(
pre_ids=pre_ids,
pre_scores=pre_scores,
ids=topk_indices,
scores=accu_scores,
beam_size=beam_size,
end_id=eos_idx)
layers.increment(x=step_idx, value=1.0, in_place=True)
# update states
layers.array_write(selected_ids, i=step_idx, array=ids)
layers.array_write(selected_scores, i=step_idx, array=scores)
layers.assign(pre_src_attn_bias, trg_src_attn_bias)
layers.assign(pre_enc_output, enc_output)
for i in range(n_layer):
layers.assign(pre_caches[i]["k"], caches[i]["k"])
layers.assign(pre_caches[i]["v"], caches[i]["v"])
layers.assign(
layers.elementwise_add(
x=slf_attn_pre_softmax_shape,
y=attn_pre_softmax_shape_delta),
slf_attn_pre_softmax_shape)
layers.assign(
layers.elementwise_add(
x=slf_attn_post_softmax_shape,
y=attn_post_softmax_shape_delta),
slf_attn_post_softmax_shape)
length_cond = layers.less_than(x=step_idx, y=max_len)
finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
layers.logical_and(x=length_cond, y=finish_cond, out=cond)
finished_ids, finished_scores = layers.beam_search_decode(
ids, scores, beam_size=beam_size, end_id=eos_idx)
return finished_ids, finished_scores
finished_ids, finished_scores = beam_search()
return finished_ids, finished_scores
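# The caching scheme above can be illustrated in isolation: at each decoding
# step only the newest token's key/value projections are computed and appended
# to the per-layer cache, so earlier positions are never re-projected. A
# minimal numpy sketch of the idea (hypothetical shapes and names, not the
# Fluid implementation):
import numpy as np

def demo_kv_cache(n_steps=3, d_model=8):
    # starts empty, mirroring fill_constant_batch_size_like with shape [-1, 0, d_model]
    cache_k = np.zeros((1, 0, d_model), dtype="float32")
    cache_v = np.zeros((1, 0, d_model), dtype="float32")
    for step in range(n_steps):
        # project only the newest decoded token (random stand-ins here)
        new_k = np.random.rand(1, 1, d_model).astype("float32")
        new_v = np.random.rand(1, 1, d_model).astype("float32")
        # mirrors layers.concat([cache["k"], k], axis=1) in multi_head_attention
        cache_k = np.concatenate([cache_k, new_k], axis=1)
        cache_v = np.concatenate([cache_v, new_v], axis=1)
        # attention at this step attends over all step + 1 cached positions
        print cache_k.shape, cache_v.shape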
......@@ -198,7 +198,8 @@ class DataReader(object):
for line in f_obj:
fields = line.strip().split(self._delimiter)
if len(fields) != 2 or (self._only_src and len(fields) != 1):
if (not self._only_src and len(fields) != 2) or (self._only_src and
len(fields) != 1):
continue
sample_words = []
......@@ -275,7 +276,7 @@ class DataReader(object):
for sample_idx in self._sample_idxs:
if self._only_src:
yield (self._src_seq_ids[sample_idx])
yield (self._src_seq_ids[sample_idx], )
else:
yield (self._src_seq_ids[sample_idx],
self._trg_seq_ids[sample_idx][:-1],
......