Commit 88142779 authored by: G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/models into add-WMT-enfr

url=http://deep-asr-data.gz.bcebos.com/aishell_pretrained_model.tar.gz
md5=7b51bde64e884f43901b7a3461ccbfa3
wget -c $url
echo "Checking md5 sum ..."
md5sum_tmp=`md5sum aishell_pretrained_model.tar.gz | cut -d ' ' -f1`
if [ "$md5sum_tmp" != "$md5" ]; then
echo "Md5sum check failed, please remove and redownload aishell_pretrained_model.tar.gz."
exit 1
fi
tar xvf aishell_pretrained_model.tar.gz
decode_to_path=./decoding_result.txt
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -u ../../infer_by_ckpt.py --batch_size 96 \
--checkpoint checkpoints/deep_asr.latest.checkpoint \
--infer_feature_lst data/test_feature.lst \
--mean_var data/global_mean_var \
--frame_dim 80 \
......@@ -10,9 +10,9 @@ python -u ../../infer_by_ckpt.py --batch_size 96 \
--num_threads 24 \
--beam_size 11 \
--decode_to_path $decode_to_path \
--trans_model aux/final.mdl \
--log_prior aux/logprior \
--vocabulary aux/graph/words.txt \
--graphs aux/graph/HCLG.fst \
--acoustic_scale 0.059 \
--parallel
data_dir=~/.cache/paddle/dataset/speech/deep_asr_data/aishell
data_url='http://deep-asr-data.gz.bcebos.com/aishell_data.tar.gz'
lst_url='http://deep-asr-data.gz.bcebos.com/aishell_lst.tar.gz'
aux_url='http://deep-asr-data.gz.bcebos.com/aux.tar.gz'
md5=17669b8d63331c9326f4a9393d289bfb
aux_md5=50e3125eba1e3a2768a6f2e499cc1749
if [ ! -e $data_dir ]; then
mkdir -p $data_dir
......@@ -35,3 +37,7 @@ wget -c -P data $lst_url
tar xvf data/aishell_lst.tar.gz -C data
ln -s $data_dir data/aishell
echo "Download and untar aux files ..."
wget -c $aux_url
tar xvf aux.tar.gz
ref_txt=aux/test.ref.txt
hyp_txt=decoding_result.txt
python ../../score_error_rate.py --error_rate_type cer --ref $ref_txt --hyp $hyp_txt
......@@ -16,10 +16,18 @@ def parse_args():
default='cer',
choices=['cer', 'wer'],
help="Error rate type. (default: %(default)s)")
parser.add_argument(
'--special_tokens',
type=str,
default='<SPOKEN_NOISE>',
help="Special tokens in scoring CER, seperated by space. "
"They shouldn't be splitted and should be treated as one special "
"character. Example: '<SPOKEN_NOISE> <bos> <eos>' "
"(default: %(default)s)")
parser.add_argument(
'--ref', type=str, required=True, help="The ground truth text.")
parser.add_argument(
'--hyp', type=str, required=True, help="The decoding result.")
'--hyp', type=str, required=True, help="The decoding result text.")
args = parser.parse_args()
return args
......@@ -31,6 +39,8 @@ if __name__ == '__main__':
sum_errors, sum_ref_len = 0.0, 0
sent_cnt, not_in_ref_cnt = 0, 0
special_tokens = args.special_tokens.split(" ")
with open(args.ref, "r") as ref_txt:
line = ref_txt.readline()
while line:
......@@ -51,6 +61,8 @@ if __name__ == '__main__':
continue
if args.error_rate_type == 'cer':
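# Map each multi-character special token to a single placeholder
# character so that it counts as one symbol when computing CER.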
for sp_tok in special_tokens:
sent = sent.replace(sp_tok, '\0')
errors, ref_len = char_errors(
ref_dict[key].decode("utf8"),
sent.decode("utf8"),
......
# Introduction to models
## Image Classification
Image classification assigns images to categories according to their semantic content. It is a fundamental problem in computer vision and underpins higher-level vision tasks such as object detection, image segmentation, object tracking, action analysis, and face recognition. It is widely used across many fields: face recognition and intelligent video analysis in security, traffic scene recognition in transportation, content-based image retrieval and automatic album organization on the web, and image recognition in medicine.
......@@ -22,9 +20,12 @@
The goal of object detection is, given an image or video frame, to have the computer find the locations of all objects present and assign each a specific category. For humans this is a trivial task, but a computer only "sees" the numbers the image has been encoded into: it is hard for it to extract high-level semantic concepts, such as a person or an object appearing in the image or frame, and harder still to localize where in the image an object appears. Moreover, since objects can appear anywhere in an image or frame, take on endlessly varied shapes, and sit against widely differing backgrounds, object detection remains a challenging problem for computers.
In the object detection task, we show how to train a general-purpose object detection model on the [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [MS COCO](http://cocodataset.org/#home) datasets. The algorithm covered so far is SSD (Single Shot MultiBox Detector), one of the newer and better-performing detectors, notable for both high detection speed and high detection accuracy.
Detecting faces in unconstrained environments, especially small, blurred, and partially occluded faces, is also a challenging task. We also show how to train PyramidBox, Baidu's in-house face detection model, on the [WIDER FACE](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/) data. In March 2018 the algorithm ranked [first](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/WiderFace_Results.html) in multiple WIDER FACE evaluations.
- [Single Shot MultiBox Detector](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md)
- [Face Detector: PyramidBox](https://github.com/PaddlePaddle/models/tree/develop/fluid/face_detection/README_cn.md)
## Semantic Image Segmentation
......
model/
pretrained/
data/
label/
data/WIDER_train
data/WIDER_val
data/wider_face_split
vgg_ilsvrc_16_fc_reduced*
*.swp
*.log
log*
output*
infer_results*
pred
eval_tools
README_cn.md
The example programs in this directory require the latest develop branch build of PaddlePaddle. If your installed PaddlePaddle is older than this, please follow the [installation guide](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html) to update it.
---
## PyramidBox Face Detection
## Table of Contents
- [Introduction](#introduction)
- [Data Preparation](#data-preparation)
- [Model Training](#model-training)
- [Model Evaluation](#model-evaluation)
- [Model Release](#model-release)
### Introduction
Face detection is a classic computer vision task, and detecting small, blurred, and occluded faces in unconstrained scenes is the most challenging problem in this area. [PyramidBox](https://arxiv.org/pdf/1803.07737.pdf) is a single-stage, SSD-based face detector that exploits contextual information to detect hard faces. As shown below, PyramidBox makes predictions at different levels on feature maps of six scales. The work mainly comprises the following modules: LFPN, Pyramid Anchors, CPM, and Data-anchor-sampling. See the paper at https://arxiv.org/pdf/1803.07737.pdf for details; a brief introduction of each follows.
<p align="center">
<img src="images/architecture_of_pyramidbox.jpg" height=316 width=415 hspace='10'/> <br />
PyramidBox face detection model
</p>
**LFPN**: LFPN stands for Low-level Feature Pyramid Networks. In detection, LFPN fully combines high-level features, which carry more context, with low-level features, which carry more texture. High-level features are used to detect large faces, while low-level features are used to detect small faces. To integrate the high-level features into the high-resolution low-level features, the top-down fusion starts from a middle layer, building a low-level FPN.
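To make the fusion step concrete, here is a minimal numpy sketch of one top-down merge (an illustration only: it assumes CHW feature maps whose channel counts already match and a 2x scale gap, whereas the real network uses convolution and upsampling layers):
```python
import numpy as np

def lfpn_fuse(high, low):
    """Fuse a coarser, higher-level feature map into a finer, lower-level one."""
    # 2x nearest-neighbor upsample of the coarse map (CHW layout)
    up = high.repeat(2, axis=1).repeat(2, axis=2)
    # crop to the fine map's size (odd sizes) and merge by elementwise addition
    return up[:, :low.shape[1], :low.shape[2]] + low
```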
**Pyramid Anchors**: The algorithm uses a semi-supervised scheme to generate approximate, semantically meaningful labels related to face detection, and proposes an anchor-based context-assisted method that introduces supervision to learn contextual features of small, blurred, and partially occluded faces. Starting from the annotated face boxes, users can expand them by fixed ratios to obtain head labels (expanded by 1/2 on each side) and body labels (with a user-defined expansion ratio).
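As an illustration, deriving the two context labels from one annotated face box might look like the sketch below; the 1/2 head expansion follows the text above, while the body ratio is a made-up example value:
```python
def context_boxes(face, body_expand=1.5):
    """Derive head and body context boxes from a face box (xmin, ymin, xmax, ymax)."""
    xmin, ymin, xmax, ymax = face
    w, h = xmax - xmin, ymax - ymin
    # head: expand the face box by 1/2 of its size on each side
    head = (xmin - 0.5 * w, ymin - 0.5 * h, xmax + 0.5 * w, ymax + 0.5 * h)
    # body: the expansion ratio is user-defined (1.5 is only an example)
    body = (xmin - body_expand * w, ymin - body_expand * h,
            xmax + body_expand * w, ymax + body_expand * h)
    return head, body
```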
**CPM**: CPM stands for Context-sensitive Predict Module. The method designs a context-sensitive structure (CPM) to increase the expressive power of the prediction network.
**Data-anchor-sampling**: A new sampling method, Data-anchor-sampling, is designed to increase the diversity of training samples across scales. It reshapes the distribution of training samples, putting more emphasis on smaller faces.
The PyramidBox model demonstrates robust detection on the example image below, which contains one thousand faces; the model detects 880 of them.
<p align="center">
<img src="images/demo_img.jpg" height=255 width=455 hspace='10'/> <br />
PyramidBox face detection performance
</p>
### Data Preparation
This tutorial trains and tests the model on the [WIDER FACE dataset](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/); the official site describes the data in detail.
The WIDER FACE dataset contains 32,203 images with 393,703 faces that vary considerably in scale, pose, and occlusion. The dataset is organized into 61 event classes; within each class, 40% of the images are randomly selected for training, 10% for validation, and 50% for testing.
First, download the training and validation sets from the official site and place them in the `data` directory; both Google Drive and Baidu Cloud links are provided, so use whichever suits you. Then download the annotations for the training and validation sets:
```bash
./data/download.sh
```
Once the data is ready, the `data` directory looks like this:
```
data
|-- download.sh
|-- wider_face_split
| |-- readme.txt
| |-- wider_face_train_bbx_gt.txt
| |-- wider_face_val_bbx_gt.txt
| `-- ...
|-- WIDER_train
| `-- images
| |-- 0--Parade
| ...
| `-- 9--Press_Conference
`-- WIDER_val
`-- images
|-- 0--Parade
...
`-- 9--Press_Conference
```
### Model Training
#### Download the Pretrained Model
We provide a pretrained model with a VGGNet-based backbone. Download it with:
```bash
wget http://paddlemodels.bj.bcebos.com/vgg_ilsvrc_16_fc_reduced.tar.gz
tar -xf vgg_ilsvrc_16_fc_reduced.tar.gz && rm -f vgg_ilsvrc_16_fc_reduced.tar.gz
```
Note: this pretrained model was converted from [Caffe](http://cs.unc.edu/~wliu/projects/ParseNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel). We will release our own pretrained model soon.
#### Start Training
`train.py` is the main entry point for training. Example invocation:
```bash
python -u train.py --batch_size=16 --pretrained_model=vgg_ilsvrc_16_fc_reduced
```
- Set `export CUDA_VISIBLE_DEVICES=0,1,2,3` to choose which GPUs to use.
- For the full list of options, run:
```bash
python train.py --help
```
Data augmentation used in training:
**Data augmentation**: Data loading is defined in `reader.py`, and every image is resized to 640x640. During training, images are further augmented with random distortion, flipping, cropping, and so on, similar to the augmentation used by the [SSD object detection algorithm](https://github.com/PaddlePaddle/models/blob/develop/fluid/object_detection/README_cn.md#%E8%AE%AD%E7%BB%83-pascal-voc-%E6%95%B0%E6%8D%AE%E9%9B%86). On top of that, the Data-anchor-sampling mentioned above is applied:
**Scale transformation (Data-anchor-sampling)**: The image is randomly rescaled within a range of scales, greatly increasing the scale variation of faces. Concretely, for a randomly chosen face of a given height and width, compute $v=\sqrt{width \times height}$ and determine which interval of the anchor scales $[16,32,64,128,256,512]$ the value $v$ falls into. Suppose $v=45$; then $32<v<64$, and one of $[16,32,64]$ is chosen uniformly at random. If $64$ is chosen, the resize scale for that face is drawn from $[64/2, \min(v \times 2, 64 \times 2)]$.
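The selection rule can be sketched as follows (a simplified illustration; the released implementation in `image_util.py` differs in details such as the index randomization):
```python
import math
import random

def data_anchor_scale(face_w, face_h, scale_array=(16, 32, 64, 128, 256, 512)):
    """Pick a whole-image resize factor per the Data-anchor-sampling rule above."""
    v = math.sqrt(face_w * face_h)
    # index of the anchor scale interval containing v (e.g. v=45 -> index of 32)
    idx = max(0, min(int(math.log(v / scale_array[0], 2)), len(scale_array) - 1))
    # pick one anchor uniformly from the scales up to the next one
    anchor = scale_array[random.randint(0, min(idx + 1, len(scale_array) - 1))]
    # draw the target face size from [anchor / 2, min(2 * v, 2 * anchor)]
    target = random.uniform(anchor / 2.0, min(2.0 * v, 2.0 * anchor))
    return target / v  # resize factor applied to the whole image
```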
**Notes**
- The CPM module in this release differs slightly from the paper; it is faster to train and test than the paper's version.
- The body part of the Pyramid Anchors module can be tuned by choosing its height/width expansion for different settings, and the loss weights of the face, head, and body parts can likewise be tuned.
### Model Evaluation
Evaluation on the validation set takes two steps: first predict the detection boxes and confidences on the validation set, then run the official WIDER FACE evaluation script on the predictions.
- Predict detection results
```bash
python -u widerface_eval.py --model_dir=output/159 --pred_dir=pred
```
More options:
```bash
python -u widerface_eval.py --help
```
**Note**: `multi_scale_test_pyramid` in `widerface_eval.py` is optional; thanks to Data-anchor-sampling, denser anchors yield a larger performance gain.
- Evaluate the AP metric
Download the official evaluation script and compute the average precision (AP):
```bash
wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/eval_script/eval_tools.zip
unzip eval_tools.zip && rm -f eval_tools.zip
```
In `eval_tools/wider_eval.m`, set the directory holding the detection results and the name of the curve to plot:
```txt
% Set the folder holding the detection results here.
pred_dir = './pred';
% Set the name of the curve to plot here.
legend_name = 'Fluid-PyramidBox';
```
`wider_eval.m` is the main entry point of the evaluation module. Run it from the command line with:
```bash
matlab -nodesktop -nosplash -nojvm -r "run wider_eval.m;quit;"
```
### Model Release
| Model | Pretrained Model | Training Data | Test Data | mAP |
|:------------------------:|:------------------:|:----------------:|:------------:|:----:|
|[Pyramidbox-v1-SSD 640x640]() | [VGGNet](http://paddlemodels.bj.bcebos.com/vgg_ilsvrc_16_fc_reduced.tar.gz) | WIDER FACE train | WIDER FACE Val | 95.6% / 94.7% / 89.3% |
#### Performance Curves
<p align="center">
<img src="images/wider_pr_cruve_int_easy_val.jpg" width="280" />
<img src="images/wider_pr_cruve_int_medium_val.jpg" width="280" />
<img src="images/wider_pr_cruve_int_hard_val.jpg" width="280" /></br>
WIDER FACE Easy/Medium/Hard set
</p>
> The model parameters of this PaddlePaddle implementation are still being tuned; results better than the curves above will be released later.
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
echo "Downloading..."
wget http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/support/bbx_annotation/wider_face_split.zip
echo "Extracting..."
unzip wider_face_split.zip && rm -f wider_face_split.zip
......@@ -131,12 +131,13 @@ def data_anchor_sampling(sampler, bbox_labels, image_width, image_height,
rand_idx_size = range_size + 1
else:
# np.random.randint range: [low, high)
rng_rand_size = np.random.randint(0, range_size + 1)
rand_idx_size = rng_rand_size % (range_size + 1)
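# Resize bounds: from half to double the chosen anchor scale, capped at
# twice the face size (2 * sqrt(wid * hei)).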
min_resize_val = scale_array[rand_idx_size] / 2.0
max_resize_val = min(2.0 * scale_array[rand_idx_size],
2 * math.sqrt(wid * hei))
scale_choose = random.uniform(min_resize_val, max_resize_val)
sample_bbox_size = wid * resize_width / scale_choose
w_off_orig = 0.0
......@@ -389,9 +390,19 @@ def crop_image_sampling(img, bbox_labels, sample_bbox, image_width,
roi_width = cross_width
roi_height = cross_height
roi_y1 = int(roi_ymin)
roi_y2 = int(roi_ymin + roi_height)
roi_x1 = int(roi_xmin)
roi_x2 = int(roi_xmin + roi_width)
cross_y1 = int(cross_ymin)
cross_y2 = int(cross_ymin + cross_height)
cross_x1 = int(cross_xmin)
cross_x2 = int(cross_xmin + cross_width)
sample_img = np.zeros((height, width, 3))
sample_img[roi_y1 : roi_y2, roi_x1 : roi_x2] = \
img[cross_y1 : cross_y2, cross_x1 : cross_x2]
sample_img = cv2.resize(
sample_img, (resize_width, resize_height), interpolation=cv2.INTER_AREA)
......
......@@ -52,7 +52,7 @@ def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True):
class PyramidBox(object):
def __init__(self,
data_shape,
num_classes=None,
use_transposed_conv2d=True,
is_infer=False,
sub_network=False):
......@@ -414,5 +414,5 @@ class PyramidBox(object):
nms_threshold=0.3,
nms_top_k=5000,
keep_top_k=750,
score_threshold=0.01)
return test_program, face_nmsed_out
......@@ -59,30 +59,25 @@ class Settings(object):
self.saturation_delta = 0.5
self.brightness_prob = 0.5
# _brightness_delta is the normalized value by 256
self.brightness_delta = 0.125
self.scale = 0.007843 # 1 / 127.5
self.data_anchor_sampling_prob = 0.5
self.min_face_size = 8.0
def to_chw_bgr(image):
"""
Transpose image from HWC to CHW and from RGB to BGR.
Args:
image (np.array): an image with HWC layout and RGB channel order.
"""
# HWC to CHW
if len(image.shape) == 3:
image = np.swapaxes(image, 1, 2)
image = np.swapaxes(image, 1, 0)
# RGB to BGR
image = image[[2, 1, 0], :, :]
return image
def preprocess(img, bbox_labels, mode, settings, image_path):
......@@ -108,9 +103,6 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
batch_sampler, bbox_labels, img_width, img_height, scale_array,
settings.resize_width, settings.resize_height)
img = np.array(img)
if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox)))
img, sampled_labels = image_util.crop_image_sampling(
......@@ -119,17 +111,7 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
settings.min_face_size)
img = img.astype('uint8')
img = Image.fromarray(img)
else:
# hard-code here
......@@ -173,12 +155,8 @@ def preprocess(img, bbox_labels, mode, settings, image_path):
tmp = sampled_labels[i][1]
sampled_labels[i][1] = 1 - sampled_labels[i][3]
sampled_labels[i][3] = 1 - tmp
img = to_chw_bgr(img)
img = img.astype('float32')
img -= settings.img_mean
img = img * settings.scale
......@@ -192,25 +170,24 @@ def load_file_list(input_txt):
file_dict = {}
num_class = 0
for i in range(len(lines_input_txt)):
line_txt = lines_input_txt[i].strip('\n\t\r')
if '--' in line_txt:
if i != 0:
num_class += 1
file_dict[num_class] = []
file_dict[num_class].append(line_txt)
if '--' not in line_txt:
if len(line_txt) > 6:
split_str = line_txt.split(' ')
x1_min = float(split_str[0])
y1_min = float(split_str[1])
x2_max = float(split_str[2])
y2_max = float(split_str[3])
line_txt = str(x1_min) + ' ' + str(y1_min) + ' ' + str(
x2_max) + ' ' + str(y2_max)
file_dict[num_class].append(line_txt)
else:
file_dict[num_class].append(line_txt)
return file_dict
......@@ -248,7 +225,7 @@ def train_generator(settings, file_list, batch_size, shuffle=True):
label_offs = [0]
for index_image in file_dict.keys():
image_name = file_dict[index_image][0]
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
......@@ -331,7 +308,7 @@ def test(settings, file_list):
def reader():
for index_image in file_dict.keys():
image_name = file_dict[index_image][0]
image_path = os.path.join(settings.data_dir, image_name)
im = Image.open(image_path)
if im.mode == 'L':
......@@ -351,12 +328,7 @@ def infer(settings, image_path):
img = img.resize((settings.resize_width, settings.resize_height),
Image.ANTIALIAS)
img = np.array(img)
img = to_chw_bgr(img)
img = img.astype('float32')
img -= settings.img_mean
img = img * settings.scale
......
......@@ -5,27 +5,26 @@ import time
import argparse
import functools
import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
import reader
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "parallel")
add_arg('learning_rate', float, 0.001, "Learning rate.")
add_arg('batch_size', int, 12, "Minibatch size.")
add_arg('parallel', bool, True, "Whether use multi-GPU/threads or not.")
add_arg('learning_rate', float, 0.001, "The start learning rate.")
add_arg('batch_size', int, 16, "Minibatch size.")
add_arg('num_passes', int, 160, "Epoch number.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('model_save_dir', str, 'output', "The path to save model.")
add_arg('resize_h', int, 640, "The resized image height.")
add_arg('resize_w', int, 640, "The resized image height.")
add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
add_arg('resize_w', int, 640, "The resized image width.")
add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
add_arg('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
# yapf: enable
......@@ -145,7 +144,7 @@ def train(args, config, train_file_list, optimizer_method):
fetch_list=fetches)
end_time = time.time()
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
if batch_id % 10 == 0:
if not args.use_pyramidbox:
print("Pass {0}, batch {1}, loss {2}, time {3}".format(
pass_id, batch_id, fetch_vars[0],
......@@ -164,8 +163,8 @@ if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
data_dir = 'data/WIDER_train/images/'
train_file_list = 'data/wider_face_split/wider_face_train_bbx_gt.txt'
config = reader.Settings(
data_dir=data_dir,
......
import os
from PIL import Image
from PIL import ImageDraw
def draw_bbox(image, bbox):
"""
Draw one bounding box on image.
Args:
image (PIL.Image): a PIL Image object.
bbox (np.array|list|tuple): (xmin, ymin, xmax, ymax).
"""
draw = ImageDraw.Draw(image)
xmin, ymin, xmax, ymax = bbox
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line(
[(left, top), (left, bottom), (right, bottom), (right, top),
(left, top)],
width=4,
fill='red')
def draw_bboxes(image_file, bboxes, labels=None, output_dir=None):
"""
Draw bounding boxes on image.
Args:
image_file (string): input image path.
bboxes (np.array): bounding boxes.
labels (list of string): the label names of bboxes.
output_dir (string): output directory.
"""
if labels:
assert len(bboxes) == len(labels)
image = Image.open(image_file)
draw = ImageDraw.Draw(image)
for i in range(len(bboxes)):
xmin, ymin, xmax, ymax = bboxes[i]
(left, right, top, bottom) = (xmin, xmax, ymin, ymax)
draw.line(
[(left, top), (left, bottom), (right, bottom), (right, top),
(left, top)],
width=4,
fill='red')
if labels and image.mode == 'RGB':
draw.text((left, top), labels[i], (255, 255, 0))
output_file = image_file.split('/')[-1]
if output_dir:
output_file = os.path.join(output_dir, output_file)
print("The image with bbox is saved as {}".format(output_file))
image.save(output_file)
......@@ -4,68 +4,130 @@ import numpy as np
import argparse
import functools
from PIL import Image
from PIL import ImageDraw
import paddle
import paddle.fluid as fluid
import reader
from pyramidbox import PyramidBox
from utility import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('confs_threshold', float, 0.25, "Confidence threshold to draw bbox.")
add_arg('image_path', str, '', "The data root path.")
add_arg('model_dir', str, '', "The model path.")
add_arg('use_gpu', bool, True, "Whether use GPU or not.")
add_arg('use_pyramidbox', bool, True, "Whether use PyramidBox model.")
add_arg('data_dir', str, 'data/WIDER_val/images/', "The validation dataset path.")
add_arg('model_dir', str, '', "The model path.")
add_arg('pred_dir', str, 'pred', "The path to save the evaluation results.")
add_arg('file_list', str, 'data/wider_face_split/wider_face_val_bbx_gt.txt', "The validation dataset path.")
# yapf: enable
def infer(args, config):
batch_size = 1
model_dir = args.model_dir
data_dir = args.data_dir
file_list = args.file_list
pred_dir = args.pred_dir
if not os.path.exists(model_dir):
raise ValueError("The model path [%s] does not exist." % (model_dir))
test_reader = reader.test(config, file_list)
for image, image_path in test_reader():
shrink, max_shrink = get_shrink(image.size[1], image.size[0])
det0 = detect_face(image, shrink)
det1 = flip_test(image, shrink)
[det2, det3] = multi_scale_test(image, max_shrink)
det4 = multi_scale_test_pyramid(image, max_shrink)
det = np.row_stack((det0, det1, det2, det3, det4))
dets = bbox_vote(det)
save_widerface_bboxes(image_path, dets, pred_dir)
print("Finish evaluation.")
def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
"""
Save predicted results, including bbox and score into text file.
Args:
image_path (string): file name.
bboxes_scores (np.array|list): the predicted bboxed and scores, layout
is (xmin, ymin, xmax, ymax, score)
output_dir (string): output directory.
"""
image_name = image_path.split('/')[-1]
image_class = image_path.split('/')[-2]
image_name = image_name.encode('utf-8')
image_class = image_class.encode('utf-8')
odir = os.path.join(output_dir, image_class)
if not os.path.exists(odir):
os.makedirs(odir)
ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
f = open(ofname, 'w')
f.write('{:s}\n'.format(image_class + '/' + image_name))
f.write('{:d}\n'.format(bboxes_scores.shape[0]))
for box_score in bboxes_scores:
xmin, ymin, xmax, ymax, score = box_score
f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
xmax - xmin + 1), (ymax - ymin + 1), score))
print("image infer result saved {}".format(image_name[:-4]))
f.close()
print("The predicted result is saved as {}".format(ofname))
def detect_face(image, shrink):
image_shape = [3, image.size[1], image.size[0]]
if shrink != 1:
h, w = int(image_shape[1] * shrink), int(image_shape[2] * shrink)
image = image.resize((w, h), Image.ANTIALIAS)
image_shape = [3, h, w]
img = np.array(image)
img = reader.to_chw_bgr(img)
mean = [104., 117., 123.]
scale = 0.007843
img = img.astype('float32')
img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32')
img = img * scale
img = [img]
img = np.array(img)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
main_program = fluid.Program()
startup_program = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main_program, startup_program):
network = PyramidBox(
image_shape, sub_network=args.use_pyramidbox, is_infer=True)
infer_program, nmsed_out = network.infer(main_program)
fetches = [nmsed_out]
fluid.io.load_persistables(
exe, args.model_dir, main_program=main_program)
detection, = exe.run(infer_program,
feed={'image': img},
fetch_list=fetches,
return_numpy=False)
detection = np.array(detection)
# layout: xmin, ymin, xmax, ymax, score
if detection.shape == (1, ):
print("No face detected")
return np.array([[0, 0, 0, 0, 0]])
det_conf = detection[:, 1]
det_xmin = image_shape[2] * detection[:, 2] / shrink
det_ymin = image_shape[1] * detection[:, 3] / shrink
det_xmax = image_shape[2] * detection[:, 4] / shrink
det_ymax = image_shape[1] * detection[:, 5] / shrink
det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf))
return det
def bbox_vote(det):
......@@ -86,7 +148,7 @@ def bbox_vote(det):
inter = w * h
o = inter / (area[0] + area[:] - inter)
# nms
merge_index = np.where(o >= 0.3)[0]
det_accu = det[merge_index, :]
det = np.delete(det, merge_index, 0)
......@@ -111,78 +173,6 @@ def bbox_vote(det):
return dets
def flip_test(image, shrink):
img = image.transpose(Image.FLIP_LEFT_RIGHT)
det_f = detect_face(img, shrink)
......@@ -197,18 +187,18 @@ def flip_test(image, shrink):
def multi_scale_test(image, max_shrink):
# Shrink detecting is only used to detect big faces
st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink
det_s = detect_face(image, st)
index = np.where(
np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1)
> 30)[0]
det_s = det_s[index, :]
# Enlarge the image once
bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2
det_b = detect_face(image, bt)
# Enlarge small image x times for small faces
if max_shrink > 2:
bt *= 2
while bt < max_shrink:
......@@ -216,12 +206,13 @@ def multi_scale_test(image, max_shrink):
bt *= 2
det_b = np.row_stack((det_b, detect_face(image, max_shrink)))
# Enlarged images are only used to detect small faces.
if bt > 1:
index = np.where(
np.minimum(det_b[:, 2] - det_b[:, 0] + 1,
det_b[:, 3] - det_b[:, 1] + 1) < 100)[0]
det_b = det_b[index, :]
# Shrinked images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1,
......@@ -231,23 +222,24 @@ def multi_scale_test(image, max_shrink):
def multi_scale_test_pyramid(image, max_shrink):
# Use image pyramids to detect faces
det_b = detect_face(image, 0.25)
index = np.where(
np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1)
> 30)[0]
det_b = det_b[index, :]
st = [0.75, 1.25, 1.5, 1.75]
for i in range(len(st)):
if (st[i] <= max_shrink):
det_temp = detect_face(image, st[i])
# Enlarged images are only used to detect small faces.
if st[i] > 1:
index = np.where(
np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1,
det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0]
det_temp = det_temp[index, :]
# Shrinked images are only used to detect big faces.
else:
index = np.where(
np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1,
......@@ -257,13 +249,28 @@ def multi_scale_test_pyramid(image, max_shrink):
return det_b
def get_shrink(height, width):
"""
Args:
height (int): image height.
width (int): image width.
"""
# avoid out of memory
max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
def get_round(x, loc):
str_x = str(x)
if '.' in str_x:
str_before, str_after = str_x.split('.')
len_after = len(str_after)
if len_after >= 3:
str_final = str_before + '.' + str_after[0:loc]
return float(str_final)
# fall through: keep x unchanged when there is nothing to truncate
return x
max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
if max_shrink >= 1.5 and max_shrink < 2:
max_shrink = max_shrink - 0.1
elif max_shrink >= 2 and max_shrink < 3:
......@@ -275,60 +282,12 @@ def get_im_shrink(image_shape):
elif max_shrink >= 5:
max_shrink = max_shrink - 0.5
print("max_shrink = {}".format(max_shrink))
shrink = max_shrink if max_shrink < 1 else 1
print("shrink = {}".format(shrink))
return shrink, max_shrink
if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
config = reader.Settings(data_dir=args.data_dir)
infer(args, config)
......@@ -8,6 +8,12 @@ import axpy
import flatten
import argmax
import reshape
import roipooling
import priorbox
import permute
import detection_out
import normalize
import select
#custom layer import ends
......
""" A custom layer for 'detectionout' used in 'SSD' model to produce outputs
Note: Paddle's implementation of 'detectionout' applies 'flatten' and 'softmax' ops to the 'conf' input,
while Caffe's implementation does not. Hence, you should adjust the generated 'ssd.py' to remove the 'softmax' and 'flatten' ops applied to the 'conf' input.
"""
from .register import register
def detectionoutput_shape(input_shape):
""" the output shape of this layer is dynamic and not determined by 'input_shape'
Args:
@input_shape (list of int): input shape
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
output_shape = [-1, 6]
return output_shape
def detectionoutput_layer(inputs,
name,
background_label=0,
share_location=True,
nms_param=None,
keep_top_k=100,
confidence_threshold=0.1):
""" build a layer of type 'detectionout' using fluid
Args:
@inputs (list of variables): input fluid variables for this layer
@name (str): name for this layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
if nms_param is None:
nms_param = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0}
mbox_conf_flatten = inputs[1]
mbox_priorbox = inputs[2]
mbox_priorbox_list = fluid.layers.split(mbox_priorbox, 2, dim=1)
pb = mbox_priorbox_list[0]
pbv = mbox_priorbox_list[1]
pb = fluid.layers.reshape(x=pb, shape=[-1, 4])
pbv = fluid.layers.reshape(x=pbv, shape=[-1, 4])
mbox_loc = inputs[0]
mbox_loc = fluid.layers.reshape(
x=mbox_loc, shape=[-1, mbox_conf_flatten.shape[1], 4])
default = {"nms_threshold": 0.3, "top_k": 10, "eta": 1.0}
fields = ['eta', 'top_k', 'nms_threshold']
for f in default.keys():
if f not in nms_param:
nms_param[f] = default[f]
nmsed_outs = fluid.layers.detection_output(
scores=mbox_conf_flatten,
loc=mbox_loc,
prior_box=pb,
prior_box_var=pbv,
background_label=background_label,
nms_threshold=nms_param["nms_threshold"],
nms_top_k=nms_param["top_k"],
keep_top_k=keep_top_k,
score_threshold=confidence_threshold,
nms_eta=nms_param["eta"])
return nmsed_outs
register(
kind='DetectionOutput',
shape=detectionoutput_shape,
layer=detectionoutput_layer)
......@@ -4,11 +4,6 @@
from .register import register
def flatten_shape(input_shape, axis=1, end_axis=-1):
""" calculate the output shape of this layer using input shape
......@@ -28,7 +23,7 @@ def flatten_shape(input_shape, axis=1, end_axis=-1):
start_axis += len(input_shape)
if end_axis < 0:
end_axis += len(input_shape) + 1
assert start_axis <= end_axis, 'invalid axis[%d] or end_axis[%d] params'\
% (start_axis, end_axis)
......@@ -52,18 +47,16 @@ def flatten_layer(input, name, axis=1, end_axis=-1):
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
input_shape = list(input.shape)
if input_shape[0] == -1:
input_shape[0] = 1
output_shape = flatten_shape(input_shape, axis=axis, end_axis=end_axis)
output_shape[0] = -1
else:
output_shape = flatten_shape(input_shape, axis=axis, end_axis=end_axis)
output = fluid.layers.reshape(input, shape=output_shape, name=name)
......
""" A custom layer for 'normalize' op
"""
from .register import register
def normalize_shape(input_shape,
across_spatial=True,
scale_filler=True,
eps=1e-10):
""" calculate the output shape of this layer using input shapes
Args:
@input_shape (list of tuples): input shape
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
output_shape = input_shape
return output_shape
def normalize_layer(input,
name,
across_spatial=True,
scale_filler=True,
channel_shared=False,
eps=1e-10):
""" build a layer of type 'normalize' using fluid
Args:
@inputs (list of variables): input fluid variables for this layer
@name (str): name for this layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
param_prefix = name.split('.')[0]
assert across_spatial == False, "Only support across_spatial == False for Normalize[%s]" % (
name)
l2_norm = fluid.layers.l2_normalize(input, axis=1) # l2 norm along channel
shape = [1] if channel_shared else [input.shape[1]]
scale_attr = fluid.ParamAttr(name=param_prefix + '_scale')
scale_param = fluid.layers.create_parameter(
shape=shape, dtype=input.dtype, name=name, attr=scale_attr)
out = fluid.layers.elementwise_mul(
x=l2_norm, y=scale_param, axis=-1 if channel_shared else 1)
return out
register(kind='Normalize', shape=normalize_shape, layer=normalize_layer)
""" A custom layer for 'Permute' which is equivalent to transpose in paddle
"""
from .register import register
def permute_shape(input_shape, order):
""" calculate the output shape of this layer using input shapes
Args:
@input_shape (list of numbers): input shape
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
output_shape = []
for ii in order:
assert ii < len(input_shape), "invalid order for permute layer"
output_shape.append(input_shape[ii])
return output_shape
def permute_layer(input, name, order):
""" build a layer of type 'permute' using fluid
Args:
@input (input variable): input fluid variables for this layer
@name (str): name for this layer
@order (list of int): order to permute the dims
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
output = fluid.layers.transpose(input, order, name=name)
return output
register(kind='Permute', shape=permute_shape, layer=permute_layer)
""" A custom layer for 'priorbox' which is used in ssd to generate prior box info
Since the order of prior boxes differs between Caffe and Paddle,
we use 'slice' and 'concat' ops to align them.
"""
from .register import register
def priorbox_shape(input_shapes, min_size, max_size=None, aspect_ratio=None):
""" calculate the output shape of this layer using input shapes
Args:
@input_shapes (list of tuples): a list of input shapes
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
assert len(input_shapes) == 2, "invalid inputs for Priorbox layer"
fc_shape = input_shapes[0]
N = 1
if max_size is not None:
N += 1
if aspect_ratio is not None:
N += 2 * len(aspect_ratio)
N_bbx = fc_shape[2] * fc_shape[3] * N
output_shape = [1, 2, 4 * N_bbx]
return output_shape
def priorbox_layer(inputs,
name,
min_size,
step,
max_size=None,
aspect_ratio=None,
flip=True,
clip=False,
variance=[],
offset=0.5):
""" build a layer of type 'Priorbox' using fluid
Args:
@inputs (list of variables): input fluid variables for this layer
@name (str): name for this layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
assert len(inputs) == 2, "invalid inputs for Priorbox[%s]" % (name)
input = inputs[0]
image = inputs[1]
box, variance_ = fluid.layers.prior_box(
input,
image,
min_size,
max_size,
aspect_ratio,
variance,
flip,
clip, (step, step),
offset,
min_max_aspect_ratios_order=True)
"""
#adjust layout when the output is not consistent with caffe's
feat_shape = list(input.shape)
H = feat_shape[2]
W = feat_shape[3]
box_tmp = fluid.layers.reshape(box, [H, W, -1, 4])
nb_prior_bbx = int(box_tmp.shape[2])
tensor_list = fluid.layers.split(box_tmp, nb_prior_bbx, 2)
#TODO:
# current implementation for this layer is not efficient
# and we should fix this bug in future when Paddle support the same prior-box layout with Caffe
index_list = [0]
index_list = index_list * nb_prior_bbx
index_offset = 0
if max_size is not None:
index_list[1] = -1
index_offset = 1
for ii in xrange(2 * len(aspect_ratio)):
index_list[ii + 1 + index_offset] = ii + 1
tensor_list_gathered = [tensor_list[ii] for ii in index_list]
caffe_prior_bbx = fluid.layers.concat(tensor_list_gathered, axis=2)
box = fluid.layers.reshape(caffe_prior_bbx, [1, 1, -1])
"""
box = fluid.layers.reshape(box, [1, 1, -1])
variance_ = fluid.layers.reshape(variance_, [1, 1, -1])
output = fluid.layers.concat([box, variance_], axis=1)
return output
register(kind='PriorBox', shape=priorbox_shape, layer=priorbox_layer)
......@@ -68,15 +68,23 @@ def reshape_shape(input_sp, shape, axis=0, num_axes=-1):
top_dim = shape['dim'][i]
if top_dim == 0:
copy_axes.append(i)
copy_axis_index = start_axis + i
output_shape[copy_axis_index] = input_shape[copy_axis_index]
elif top_dim == -1:
assert inferred_axis == -1, "[Reshape]new shape contains multiple -1 dims"
inferred_axis = i
else:
constant_count *= top_dim
if inferred_axis >= 0:
explicit_count = constant_count
l = input_shape[0:start_axis]
if len(l) > 0:
explicit_count *= count(l)
l = input_shape[end_axis:]
if len(l) > 0:
explicit_count *= count(l)
for i in range(len(copy_axes)):
explicit_count *= output_shape[start_axis + copy_axes[i]]
......@@ -84,6 +92,7 @@ def reshape_shape(input_sp, shape, axis=0, num_axes=-1):
assert input_count % explicit_count == 0, "[Reshape]botom count[%d] "\
"must be divisible by product of the specified dimensions[%d] "\
% (input_count, explicit_count)
output_shape[start_axis + inferred_axis] = input_count // explicit_count
output_count = count(output_shape)
assert output_count == input_count, "[Reshape]output count[%d] must match input count[%d]" % (
......@@ -117,6 +126,7 @@ def reshape_layer(input, name, shape, axis=0, num_axes=-1):
output_shape = reshape_shape(input_shape, shape, axis, num_axes)
output = fluid.layers.reshape(input, shape=output_shape, name=name)
return output
......
""" a custom layer for 'ROIPooling', maybe we should implement this in standard way.
more info can be found here: http://caffe.berkeleyvision.org/tutorial/layers/ROIPooling.html
"""
from .register import register
def roipooling_shape(input_shapes, pooled_h, pooled_w, spatial_scale):
""" calculate the output shape of this layer using input shape
Args:
@input_shapes (list of tuples): input shapes (feature map and rois)
@pooled_h (int): parameter from caffe's ROIPooling layer
@pooled_w (int): parameter from caffe's ROIPooling layer
@spatial_scale (float): parameter from caffe's ROIPooling layer
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
assert len(input_shapes) == 2, "not valid input shape for roipooling layer"
base_fea_shape = input_shapes[0]
rois_shape = input_shapes[1]
output_shape = base_fea_shape
output_shape[0] = rois_shape[0]
output_shape[2] = pooled_h
output_shape[3] = pooled_w
return output_shape
def roipooling_layer(inputs, name, pooled_h, pooled_w, spatial_scale):
""" build a layer of type 'ROIPooling' using fluid
Args:
@inputs (list of variables): input fluid variables (feature map and rois)
@name (str): name for this layer
@pooled_h (int): parameter from caffe's ROIPooling layer
@pooled_w (int): parameter from caffe's ROIPooling layer
@spatial_scale (float): parameter from caffe's ROIPooling layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
assert len(inputs) == 2, "not valid input shape for roipooling layer"
base_fea = inputs[0]
rois = inputs[1][:, 1:5]
rois_fea = fluid.layers.roi_pool(base_fea, rois, pooled_h, pooled_w,
spatial_scale)
return rois_fea
register(kind='ROIPooling', shape=roipooling_shape, layer=roipooling_layer)
""" a custom layer for 'select' which is used to replace standard 'Slice' layer
for converting layer with multiple different output tensors
"""
from .register import register
def select_shape(input_shape, slice_point, axis=1):
""" calculate the output shape of this layer using input shape
Args:
@input_shape (list of num): a list of number which represents the input shape
@slice_point (list): parameter from caffe's Slice layer
@axis (int): parameter from caffe's Slice layer
Returns:
@output_shape (list of num): a list of numbers represent the output shape
"""
input_shape = list(input_shape)
start = slice_point[0]
if len(slice_point) == 2:
end = slice_point[1]
else:
end = input_shape[axis]
assert end > start, "invalid slice_point with [start:%d, end:%d]"\
% (start, end)
output_shape = input_shape
output_shape[axis] = end - start
return output_shape
def select_layer(input, name, slice_point, axis=1):
""" build a layer of type 'Slice' using fluid
Args:
@input (variable): input fluid variable for this layer
@name (str): name for this layer
@slice_point (list): parameter from caffe's Slice layer
@axis (int): parameter from caffe's Slice layer
Returns:
output (variable): output variable for this layer
"""
import paddle.fluid as fluid
input_shape = list(input.shape)
start = slice_point[0]
if len(slice_point) == 2:
end = slice_point[1]
else:
end = input_shape[axis]
sections = []
if start > 0:
sections.append(start)
pos = len(sections)
sections.append(end - start)
if end != input_shape[axis]:
sections.append(input_shape[axis] - end)
outputs = fluid.layers.split(input, sections, dim=axis, name=name)
return outputs[pos]
register(kind='Select', shape=select_shape, layer=select_layer)
......@@ -16,7 +16,7 @@ LAYER_DESCRIPTORS = {
'Concat': shape_concat,
'ContrastiveLoss': shape_scalar,
'Convolution': shape_convolution,
'Deconvolution': shape_deconvolution,
'Data': shape_data,
'Dropout': shape_identity,
'DummyData': shape_data,
......@@ -179,6 +179,11 @@ class LayerAdapter(object):
@property
def parameters(self):
name = NodeDispatch.get_handler_name(self.kind)
if self.kind.lower() == "normalize":
name = "norm"
elif self.kind.lower() == "deconvolution":
name = "convolution"
name = '_'.join((name, 'param'))
try:
return getattr(self.layer, name)
......@@ -207,7 +212,9 @@ class LayerAdapter(object):
@property
def kernel_parameters(self):
assert self.kind in (NodeKind.Convolution, NodeKind.Pooling,\
NodeKind.Deconvolution)
params = self.parameters
k_h = self.get_kernel_value(params.kernel_h, params.kernel_size, 0)
k_w = self.get_kernel_value(params.kernel_w, params.kernel_size, 1)
......@@ -217,9 +224,25 @@ class LayerAdapter(object):
params.stride_w, params.stride, 1, default=1)
p_h = self.get_kernel_value(params.pad_h, params.pad, 0, default=0)
p_w = self.get_kernel_value(params.pad_w, params.pad, 1, default=0)
dila_h = dila_w = 1
if self.kind in (NodeKind.Convolution, NodeKind.Deconvolution):
dila_len = len(params.dilation)
if dila_len == 2:
dila_h = params.dilation[0]
dila_w = params.dilation[1]
elif dila_len == 1:
dila_h = dila_w = params.dilation[0]
else:
assert dila_len == 0, "invalid length[%s] of dilation in convolution" % (
dila_len)
return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w, dila_h, dila_w)
KernelParameters = namedtuple(
'KernelParameters',
[
'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w',
'dila_h', 'dila_w'
], )
......@@ -91,7 +91,7 @@ class Network(object):
name = '%s_%s' % (op_name, param_name)
v = fluid.global_scope().find_var(name)
w = v.get_tensor()
w.set(data.reshape(w.shape()), place)
except ValueError:
if not ignore_missing:
raise
......@@ -144,6 +144,7 @@ class Network(object):
relu=True,
relu_negative_slope=0.0,
padding=None,
dilation=1,
group=1,
biased=True):
if padding is None:
......@@ -173,6 +174,7 @@ class Network(object):
num_filters=c_o,
stride=[s_h, s_w],
padding=padding,
dilation=dilation,
groups=group,
param_attr=fluid.ParamAttr(name=prefix + "weights"),
bias_attr=fluid.ParamAttr(name=prefix + "biases"),
......@@ -183,6 +185,58 @@ class Network(object):
return output
@layer
def deconv(self,
input,
k_h,
k_w,
c_o,
s_h,
s_w,
name,
relu=True,
relu_negative_slope=0.0,
padding=None,
dilation=1,
biased=True):
if padding is None:
padding = [0, 0]
# Get the number of channels in the input
c_i, h_i, w_i = input.shape[1:]
fluid = import_fluid()
prefix = name + '_'
leaky_relu = False
act = 'relu'
if relu is False:
act = None
elif relu_negative_slope != 0.0:
leaky_relu = True
act = None
p_h = padding[0]
p_w = padding[1]
h_o = (h_i - 1) * s_h - 2 * p_h + dilation * (k_h - 1) + 1
w_o = (w_i - 1) * s_w - 2 * p_w + dilation * (k_w - 1) + 1
output = fluid.layers.conv2d_transpose(
name=self.get_unique_output_name(name, 'conv2d_transpose'),
input=input,
num_filters=c_o,
output_size=[h_o, w_o],
filter_size=[k_h, k_w],
padding=padding,
stride=[s_h, s_w],
dilation=dilation,
param_attr=fluid.ParamAttr(name=prefix + "weights"),
bias_attr=fluid.ParamAttr(name=prefix + "biases"),
act=act)
if leaky_relu:
output = fluid.layers.leaky_relu(output, alpha=relu_negative_slope)
return output
@layer
def relu(self, input, name):
fluid = import_fluid()
......@@ -256,6 +310,12 @@ class Network(object):
return fluid.layers.sigmoid(
input, name=self.get_unique_output_name(name, 'sigmoid'))
@layer
def tanh(self, input, name):
fluid = import_fluid()
return fluid.layers.tanh(
input, name=self.get_unique_output_name(name, 'tanh'))
@layer
def lrn(self, input, radius, alpha, beta, name, bias=1.0):
fluid = import_fluid()
......
......@@ -9,21 +9,6 @@ from ..transformers import (DataInjector, DataReshaper, NodeRenamer,
from . import network
class PaddleNode(object):
'''An intermediate representation for Paddle operations.'''
......@@ -78,10 +63,11 @@ class PaddleMapper(NodeMapper):
def get_kernel_params(self, node):
kernel_params = node.layer.kernel_parameters
input_shape = node.get_only_parent().output_shape
padding = [kernel_params.pad_h, kernel_params.pad_w]
if padding[0] == 0 and padding[1] == 0:
padding = {}
else:
padding = {'padding': padding}
return (kernel_params, padding)
def map_convolution(self, node):
......@@ -95,12 +81,34 @@ class PaddleMapper(NodeMapper):
kwargs['group'] = group
if not node.parameters.bias_term:
kwargs['biased'] = False
if kernel_params.dila_h != 1 or kernel_params.dila_w != 1:
kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w)
assert kernel_params.kernel_h == h
assert kernel_params.kernel_w == w
return MaybeActivated(node)(
'conv', kernel_params.kernel_h, kernel_params.kernel_w, c_o,
kernel_params.stride_h, kernel_params.stride_w, **kwargs)
def map_deconvolution(self, node):
(kernel_params, kwargs) = self.get_kernel_params(node)
h = kernel_params.kernel_h
w = kernel_params.kernel_w
c_o = node.output_shape[1]
c_i = node.parents[0].output_shape[1]
if not node.parameters.bias_term:
kwargs['biased'] = False
if kernel_params.dila_h != 1 or kernel_params.dila_w != 1:
kwargs['dilation'] = (kernel_params.dila_h, kernel_params.dila_w)
assert kernel_params.kernel_h == h
assert kernel_params.kernel_w == w
return MaybeActivated(node)(
'deconv', kernel_params.kernel_h, kernel_params.kernel_w, c_o,
kernel_params.stride_h, kernel_params.stride_w, **kwargs)
def map_relu(self, node):
return PaddleNode('relu')
......
......@@ -6,6 +6,8 @@ from .errors import KaffeError
Tensor4DShape = namedtuple('Tensor4DShape',
['batch_size', 'channels', 'height', 'width'])
Tensor3DShape = namedtuple('Tensor3DShape', ['batch_size', 'data1', 'data2'])
Tensor2DShape = namedtuple('Tensor2DShape', ['batch_size', 'data'])
ScalarShape = namedtuple('ScalarShape', ['batch_size'])
......@@ -14,6 +16,8 @@ ScalarShape = namedtuple('ScalarShape', ['batch_size'])
def make_tensor(batch_size, d1=None, d2=None, d3=None):
if d3 is not None:
return Tensor4DShape(batch_size, d1, d2, d3)
elif d1 is not None and d2 is not None:
return Tensor3DShape(batch_size, d1, d2)
elif d1 is not None and d2 is None:
return Tensor2DShape(batch_size, d1)
elif d1 is None and d2 is None and d3 is None:
......@@ -24,10 +28,14 @@ def make_tensor(batch_size, d1=None, d2=None, d3=None):
def get_filter_output_shape(i_h, i_w, params, round_func):
dila_h = getattr(params, 'dila_h', 1)
dila_w = getattr(params, 'dila_w', 1)
o_h = (i_h + 2 * params.pad_h -
(dila_h * (params.kernel_h - 1) + 1)) / float(params.stride_h) + 1
o_w = (i_w + 2 * params.pad_w -
(dila_w * (params.kernel_w - 1) + 1)) / float(params.stride_w) + 1
return (int(round_func(o_h)), int(round_func(o_w)))
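# Worked check of the dilated formula above (illustrative numbers, not from
# the source): a 3x3 kernel with dilation 2 spans dila_h * (kernel_h - 1) + 1
# = 5 input pixels, so a 64-pixel side with pad 2 and stride 1 keeps its
# size: (64 + 2 * 2 - 5) / 1 + 1 = 64.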
......@@ -97,6 +105,34 @@ def shape_convolution(node):
return get_strided_kernel_output_shape(node, math.floor)
def shape_deconvolution(node):
assert node.layer is not None
input_shape = node.get_only_parent().output_shape
h_i = input_shape.height
w_i = input_shape.width
params = node.layer.kernel_parameters
p_h = params.pad_h
p_w = params.pad_w
dila_h = params.dila_h
dila_w = params.dila_w
k_h = params.kernel_h
k_w = params.kernel_w
s_h = params.stride_h
s_w = params.stride_w
h_o = (h_i - 1) * s_h - 2 * p_h + dila_h * (k_h - 1) + 1
w_o = (w_i - 1) * s_w - 2 * p_w + dila_w * (k_w - 1) + 1
params = node.layer.parameters
has_c_o = hasattr(params, 'num_output')
c = params.num_output if has_c_o else input_shape.channels
return make_tensor(input_shape.batch_size, c, h_o, w_o)
def shape_pool(node):
global_pool = getattr(node.layer.parameters, 'global_pooling', False)
if global_pool:
......
......@@ -325,7 +325,8 @@ class ParameterNamer(object):
for node in graph.nodes:
if node.data is None:
continue
if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct,\
NodeKind.Deconvolution):
names = ('weights', )
if node.parameters.bias_term:
names += ('biases', )
......@@ -337,6 +338,8 @@ class ParameterNamer(object):
names = ('scale', )
if getattr(node.parameters, 'bias_term', False):
names = ('scale', 'offset')
elif node.kind == "Normalize":
names = ('scale', )
else:
warn('Unhandled parameters when naming node [%s]' %
(node.kind))
......
......@@ -34,7 +34,7 @@ tar xf ${valid_tar} -C ${valid_folder}
echo "Download imagenet label file: val_list.txt & train_list.txt"
label_file=ImageNet_label.tgz
label_url=http://paddle-imagenet-models.bj.bcebos.com/${label_file}
wget -nd -c ${label_url}
tar zxf ${label_file}
......@@ -110,7 +110,7 @@ python -u train.py \
```
For more detailed information on these parameters, please refer to the comments in `config.py`.
Training uses all GPUs by default; set the `CUDA_VISIBLE_DEVICES` environment variable to control how many GPUs are used. Training can also run on CPU only (via the `--device CPU` argument), though comparatively slowly. During training, the model is saved to the directory given by `model_dir` at the end of every epoch, and every iteration prints a log line like the following to standard output:
```txt
epoch: 0, batch: 0, sum loss: 258793.343750, avg loss: 11.069005, ppl: 64151.644531
epoch: 0, batch: 1, sum loss: 256140.718750, avg loss: 11.059616, ppl: 63552.148438
......@@ -154,9 +154,82 @@ perl multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt
```
BLEU = 25.08, 58.3/31.5/19.6/12.6 (BP=0.966, ratio=0.967, hyp_len=61321, ref_len=63412)
```
### Distributed Training
### References
The Transformer model supports both synchronous and asynchronous distributed training. Distributed training is configured in two ways (a minimal sketch of how these settings are resolved together is given after the configuration examples below):
1 Command-line options
- `--local`: takes two values; `True` means single-machine training and `False` means distributed training. Defaults to single-machine mode.
- `--sync`: takes two values, but only takes effect when `--local` is `False`; `True` selects synchronous training and `False` selects asynchronous training. Defaults to synchronous mode.
2 Environment variables
In distributed mode, the numbers of trainers and pservers are configured manually. In the network topology, every trainer connects to every pserver; the pservers act as servers and the trainers as clients. The settings for pservers and trainers are as follows:
1) pserver configuration
- `PADDLE_IS_LOCAL=[0|1]`: whether training is distributed; `0` means distributed and `1` means single-machine
- `TRAINING_ROLE=PSERVER`: marks the current node as a pserver
- `POD_IP=ip`: the externally visible service address of the current pserver
- `PADDLE_PORT=port`: the port the current pserver listens on; together with `POD_IP` it forms the pserver's unique external identity
- `PADDLE_TRAINERS_NUM=num`: the number of trainers connecting to the pservers
An example configuration with two pservers; on 192.168.2.2:
```
export PADDLE_PSERVERS=192.168.2.2,192.168.2.3
export POD_IP=192.168.2.2
export PADDLE_TRAINERS_NUM=2
export TRAINING_ROLE=PSERVER
export PADDLE_IS_LOCAL=0
export PADDLE_PORT=6177
```
On 192.168.2.3:
```
export PADDLE_PSERVERS=192.168.2.2,192.168.2.3
export POD_IP=192.168.2.3
export PADDLE_TRAINERS_NUM=2
export TRAINING_ROLE=PSERVER
export PADDLE_IS_LOCAL=0
export PADDLE_PORT=6177
```
2) trainer configuration
- `PADDLE_IS_LOCAL=[0|1]`: whether training is distributed; `0` means distributed and `1` means single-machine
- `TRAINING_ROLE=TRAINER`: marks the current node as a trainer
- `PADDLE_PSERVERS=[ip1,ip2,...]`: the comma-separated IP addresses of the pservers the trainer connects to
- `PADDLE_TRAINER_ID=num`: the ID of the current node, an integer in the range 0 to N-1
- `PADDLE_PORT=port`: the pserver service port to connect to
An example configuration with two trainers; on trainer 1:
```
export TRAINING_ROLE=TRAINER
export PADDLE_PSERVERS=192.168.2.2,192.168.2.3
export PADDLE_TRAINERS_NUM=2
export PADDLE_TRAINER_ID=0
export PADDLE_IS_LOCAL=0
export PADDLE_PORT=6177
```
On trainer 2:
```
export TRAINING_ROLE=TRAINER
export PADDLE_PSERVERS=192.168.2.2,192.168.2.3
export PADDLE_TRAINERS_NUM=2
export PADDLE_TRAINER_ID=1
export PADDLE_IS_LOCAL=0
export PADDLE_PORT=6177
```
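The resolution of these settings can be summarized in a few lines of Python. The sketch below is a minimal reading of the precedence logic in `train.py` (the helper `resolve_role` is hypothetical, not part of the script): the environment overrides the `--local` flag, and `TRAINING_ROLE` then decides whether the process runs as a pserver or a trainer.
```python
import os

def resolve_role(args_local=True):
    # Priority mirrors train.py: environment > command-line args.
    local = args_local and os.getenv("PADDLE_IS_LOCAL", "1") != "0"
    if local:
        return "LOCAL", None
    # Each pserver endpoint is ip:port, assembled from the variables above.
    port = os.getenv("PADDLE_PORT", "6174")
    endpoints = ",".join(
        ip + ":" + port
        for ip in os.getenv("PADDLE_PSERVERS", "").split(","))
    return os.getenv("TRAINING_ROLE", "TRAINER"), endpoints
```
With the pserver exports shown above, `resolve_role()` would return `('PSERVER', '192.168.2.2:6177,192.168.2.3:6177')`.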
### References
1. Vaswani A, Shazeer N, Parmar N, et al. [Attention is all you need](http://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)[C]//Advances in Neural Information Processing Systems. 2017: 6000-6010.
2. He K, Zhang X, Ren S, et al. [Deep residual learning for image recognition](http://openaccess.thecvf.com/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf)[C]//Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778.
3. Ba J L, Kiros J R, Hinton G E. [Layer normalization](https://arxiv.org/pdf/1607.06450.pdf)[J]. arXiv preprint arXiv:1607.06450, 2016.
......
......@@ -3,6 +3,7 @@ import time
import argparse
import ast
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
......@@ -80,6 +81,20 @@ def parse_args():
help='See config.py for all options',
default=None,
nargs=argparse.REMAINDER)
parser.add_argument(
'--local',
type=ast.literal_eval,
default=True,
help='Whether to run in local mode.')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
parser.add_argument(
'--sync', type=ast.literal_eval, default=True, help="Whether to use synchronous training mode.")
args = parser.parse_args()
# Append args related to dict
src_dict = reader.DataReader.load_dict(args.src_vocab_fpath)
......@@ -247,34 +262,73 @@ def split_data(data, num_part):
]
def train(args):
dev_count = fluid.core.get_cuda_device_count()
def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names,
util_input_names, sum_cost, token_num):
# Context to do validation.
test_program = train_progm.clone()
with fluid.program_guard(test_program):
test_program = fluid.io.get_inference_program([avg_cost])
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
val_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.val_file_pattern,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
pool_size=args.pool_size,
sort_type=args.sort_type,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
# reserve space for the start and end tokens
max_length=ModelHyperParams.max_length - 2,
clip_last_batch=False,
shuffle=False,
shuffle_batch=False)
lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=lr_scheduler.learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
test_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
main_program=test_program,
share_vars_from=train_exe)
def test(exe=test_exe):
test_total_cost = 0
test_total_token = 0
test_data = read_multiple(
reader=val_data.batch_generator,
count=dev_count if args.use_token_batch else 1)
for batch_id, data in enumerate(test_data()):
feed_list = []
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
data_input_dict, util_input_dict, _ = prepare_batch_input(
data_buffer, data_input_names, util_input_names,
ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model)
feed_list.append(
dict(data_input_dict.items() + util_input_dict.items()))
outs = exe.run(feed=feed_list,
fetch_list=[sum_cost.name, token_num.name])
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
test_total_cost += sum_cost_val.sum()
test_total_token += token_num_val.sum()
test_avg_cost = test_total_cost / test_total_token
test_ppl = np.exp([min(test_avg_cost, 100)])
return test_avg_cost, test_ppl
return test
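# Note on the perplexity computation in test() above (illustrative):
# the average per-token cost is clipped at 100 before exponentiating so
# that an untrained model cannot overflow np.exp -- exp(100) ~ 2.7e43 is
# still finite, while exponentiating a larger raw cost would yield inf.
# For example, np.exp([min(11.069005, 100)]) ~ [64151.6], which matches
# the "avg loss"/"ppl" pair in the sample training log.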
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
token_num, predict):
# Initialize the parameters.
if TrainTaskConfig.ckpt_path:
fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
lr_scheduler.current_steps = TrainTaskConfig.start_step
else:
print "init fluid.framework.default_startup_program"
exe.run(fluid.framework.default_startup_program())
train_data = reader.DataReader(
......@@ -305,77 +359,24 @@ def train(args):
train_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
loss_name=sum_cost.name,
main_program=train_progm,
build_strategy=build_strategy)
def test_context():
# Context to do validation.
test_program = fluid.default_main_program().clone(for_test=True)
test_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
main_program=test_program,
share_vars_from=train_exe)
val_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.val_file_pattern,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size *
(1 if args.use_token_batch else dev_count),
pool_size=args.pool_size,
sort_type=args.sort_type,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
# reserve space for the start and end tokens
max_length=ModelHyperParams.max_length - 2,
clip_last_batch=False,
shuffle=False,
shuffle_batch=False)
def test(exe=test_exe):
test_total_cost = 0
test_total_token = 0
test_data = read_multiple(
reader=val_data.batch_generator,
count=dev_count if args.use_token_batch else 1)
for batch_id, data in enumerate(test_data()):
feed_list = []
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
data_input_dict, util_input_dict, _ = prepare_batch_input(
data_buffer, data_input_names, util_input_names,
ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model)
feed_list.append(
dict(data_input_dict.items() + util_input_dict.items()))
outs = exe.run(feed=feed_list,
fetch_list=[sum_cost.name, token_num.name])
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
1])
test_total_cost += sum_cost_val.sum()
test_total_token += token_num_val.sum()
test_avg_cost = test_total_cost / test_total_token
test_ppl = np.exp([min(test_avg_cost, 100)])
return test_avg_cost, test_ppl
return test
if args.val_file_pattern is not None:
test = test_context()
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
util_input_names = encoder_util_input_fields + decoder_util_input_fields
if args.val_file_pattern is not None:
test = test_context(train_progm, avg_cost, train_exe, dev_count,
data_input_names, util_input_names, sum_cost,
token_num)
init = False
for pass_id in xrange(TrainTaskConfig.pass_num):
pass_start_time = time.time()
for batch_id, data in enumerate(train_data()):
feed_list = []
total_num_token = 0
lr_rate = lr_scheduler.update_learning_rate()
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
......@@ -384,11 +385,16 @@ def train(args):
ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
ModelHyperParams.n_head, ModelHyperParams.d_model)
total_num_token += num_token
feed_list.append(
dict(data_input_dict.items() + util_input_dict.items() +
{lr_scheduler.learning_rate.name: lr_rate}.items()))
if not init: # init the position encoding table
feed_kv_pairs = data_input_dict.items() + util_input_dict.items(
)
if args.local:
lr_rate = lr_scheduler.update_learning_rate()
feed_kv_pairs += {
lr_scheduler.learning_rate.name: lr_rate
}.items()
feed_list.append(dict(feed_kv_pairs))
if not init:
for pos_enc_param_name in pos_enc_param_names:
pos_enc = position_encoding_init(
ModelHyperParams.max_length + 1,
......@@ -408,10 +414,10 @@ def train(args):
np.exp([min(total_avg_cost, 100)])))
init = True
# Validate and save the model for inference.
print("epoch: %d, " % pass_id + (
"val avg loss: %f, val ppl: %f, " % test()
if args.val_file_pattern is not None else "") + "consumed %fs" % (
time.time() - pass_start_time))
print("epoch: %d, " % pass_id +
("val avg loss: %f, val ppl: %f, " % test()
if args.val_file_pattern is not None else "") + "consumed %fs" %
(time.time() - pass_start_time))
fluid.io.save_persistables(
exe,
os.path.join(TrainTaskConfig.ckpt_dir,
......@@ -422,6 +428,107 @@ def train(args):
data_input_names[:-2] + util_input_names, [predict], exe)
def train(args):
# priority: ENV > args > config
is_local = os.getenv("PADDLE_IS_LOCAL", "1")
if is_local == '0':
args.local = False
print args
if args.device == 'CPU':
TrainTaskConfig.use_gpu = False
training_role = os.getenv("TRAINING_ROLE", "TRAINER")
if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
else:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
exe = fluid.Executor(place)
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate)
if args.local:
optimizer = fluid.optimizer.Adam(
learning_rate=lr_scheduler.learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
elif args.sync == False:
optimizer = fluid.optimizer.SGD(0.003)
optimizer.minimize(sum_cost)
else:
lr_decay = fluid.layers\
.learning_rate_scheduler\
.noam_decay(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps)
optimizer = fluid.optimizer.Adam(
learning_rate=lr_decay,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
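# Note (illustrative summary of the three optimizer paths above):
#   local         -> Adam driven by the external LearningRateScheduler,
#                    whose rate is fed into the program each step
#   async distrib -> plain SGD with a fixed 0.003 learning rate
#   sync distrib  -> Adam with noam_decay built into the program, so the
#                    learning rate is computed inside the graph rather
#                    than fed from Python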
if args.local:
print("local start_up:")
train_loop(exe,
fluid.default_main_program(), dev_count, sum_cost, avg_cost,
lr_scheduler, token_num, predict)
else:
port = os.getenv("PADDLE_PORT", "6174")
pserver_ips = os.getenv("PADDLE_PSERVERS") # ip,ip...
eplist = []
for ip in pserver_ips.split(","):
eplist.append(':'.join([ip, port]))
pserver_endpoints = ",".join(eplist) # ip:port,ip:port...
trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
current_endpoint = os.getenv("POD_IP") + ":" + port
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
t = fluid.DistributeTranspiler()
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
if training_role == "PSERVER":
current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
"PADDLE_PORT")
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint,
pserver_prog)
print "psserver begin run"
with open('pserver_startup.desc', 'w') as f:
f.write(str(pserver_startup))
with open('pserver_prog.desc', 'w') as f:
f.write(str(pserver_prog))
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
trainer_prog = t.get_trainer_program()
with open('trainer_prog.desc', 'w') as f:
f.write(str(trainer_prog))
train_loop(exe, trainer_prog, dev_count, sum_cost, avg_cost,
lr_scheduler, token_num, predict)
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
if __name__ == "__main__":
args = parse_args()
train(args)
......@@ -60,7 +60,7 @@ cd data/coco
./pretrained/download_imagenet.sh
```
#### Training on the PASCAL VOC dataset
#### Training
`train.py` 是训练模块的主要执行程序,调用示例如下:
```bash
......
......@@ -10,7 +10,7 @@ wget http://images.cocodataset.org/zips/val2017.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
# Extract the data.
echo "Extractint..."
echo "Extracting..."
unzip train2014.zip
unzip val2014.zip
unzip train2017.zip
......
......@@ -7,7 +7,7 @@ wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
# Extract the data.
echo "Extractint..."
echo "Extracting..."
tar -xf VOCtrainval_11-May-2012.tar
tar -xf VOCtrainval_06-Nov-2007.tar
tar -xf VOCtest_06-Nov-2007.tar
......