diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index 6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80..53d010434a8ebbe0184d84f588783f25186d606a 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -151,6 +151,7 @@ def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
if data == None:
break
if iters == args.iterations:
+ reader_generator.close()
break
if iters == args.skip_batch_num:
start_time = time.time()
@@ -252,6 +253,7 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
if data == None:
break
if iters == args.iterations:
+ reader_generator.close()
break
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
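
Both hunks above add an explicit `reader_generator.close()` before breaking out of the batch loop early. A minimal standalone sketch of that pattern follows (with a hypothetical toy reader, not the benchmark's actual `train_reader`):

```python
def toy_reader(num_batches):
    # Hypothetical stand-in for the benchmark's batched reader.
    try:
        for i in range(num_batches):
            yield [i]  # pretend this is one batch of training data
    finally:
        # Runs as soon as close() is called, rather than whenever the
        # abandoned generator happens to be garbage-collected.
        print("reader cleaned up")

reader_generator = toy_reader(100)
iters, iterations = 0, 5
for data in reader_generator:
    if iters == iterations:
        reader_generator.close()  # release the reader before breaking out
        break
    iters += 1
```

Calling `close()` raises `GeneratorExit` inside the generator, so any cleanup in the reader runs immediately, which is presumably the intent of the change.
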
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
index b782242a6632a5d42a512cf3b830d6e047c064ab..e4682ccb94e6fc60e184632dff9ee16a6bf16ec0 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
@@ -1,5 +1,5 @@
-服务器端部署 - Anakin
-#####################
+Anakin - 服务器端加速引擎
+#######################
使用文档
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
deleted file mode 100644
index a5209e8560b31e9f0f776fba9a2b8c5bc150165c..0000000000000000000000000000000000000000
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-服务器端部署 - 原生引擎
-#######################
-
-.. toctree::
- :maxdepth: 2
-
- build_and_install_lib_cn.rst
- native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst
index dea7c236619a0bdbf402f371571d947d1cdbba65..89166573eebca045e948046c69f3b7a3e0031d58 100644
--- a/doc/fluid/new_docs/advanced_usage/index.rst
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
@@ -10,7 +10,6 @@
.. toctree::
:maxdepth: 2
- deploy/index_native.rst
deploy/index_anakin.rst
deploy/index_mobile.rst
development/contribute_to_paddle.md
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+train.log
+output
+data/cifar-10-batches-py/
+data/cifar-10-python.tar.gz
+data/*.txt
+data/*.list
+data/mean.meta
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
index 8d645718e12e4d976a8e71de105e11f495191fbf..4f20843596aa676962a36241f59560ec2a41257b 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
@@ -21,7 +21,7 @@
图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。
-
+
图1. 通用图像分类展示
@@ -30,7 +30,7 @@
-
+
图2. 细粒度图像分类展示
@@ -38,7 +38,7 @@
一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。
-
+
图3. 扰动图片展示[22]
@@ -61,7 +61,7 @@
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
-
+
图4. ILSVRC图像分类Top-5错误率
@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。
-
+
图5. CNN网络示例[20]
@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
-
+
图6. 基于ImageNet的VGG16模型
@@ -106,7 +106,7 @@ NIN模型主要有两个特点:
Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。
-
+
图7. Inception模块
@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没
GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。
-
+
图8. GoogleNet[12]
@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
-
+
图9. 残差模块
图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。
-
+
图10. 基于ImageNet的ResNet模型
@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。
-
+
图11. CIFAR10数据集[21]
@@ -377,7 +377,7 @@ test_reader = paddle.batch(
`event_handler_plot`可以用来利用回调数据来打点画图:
-
+
图12. 训练结果
@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。
-
+
图13. CIFAR10数据集上VGG模型的分类错误率
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png
deleted file mode 100644
index f3c5f2f7b0c84f83382b70124dcd439586ed4eb0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
deleted file mode 100644
index ca8f858a902ea723d886d2b88c2c0a1005301c50..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
deleted file mode 100644
index 38b21f21604b1bb84fc3f6aa96bd5fce45d15a55..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
deleted file mode 100644
index 647c822e52cd55d50e5f207978f5e6ada86cf34c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
deleted file mode 100644
index 04245cef60fe7126ae4c92ba8085273965078bee..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
deleted file mode 100644
index 249dbf96df61c3352ea5bd80470f6c4a1e03ff10..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
deleted file mode 100644
index 4660ac122e9d533023a21154d35eee29e3b08d27..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
deleted file mode 100644
index 9591a0c1e8c0165c40ca560be35a7b9a91cd5027..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png
deleted file mode 100644
index 39580c20b583f2a15d17fd124a572c84e6e2db1d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
deleted file mode 100644
index 77f785e03bacd38c4c64a817874a58ff3298d2f3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png
deleted file mode 100644
index 97a1e3eee45c0db95e6a943ca3b8c0cf6c34d4b6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
deleted file mode 100644
index 57e45cc0c27dd99b9918de2ff1228bc6b65f7424..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png
deleted file mode 100644
index 147e575bf49086811c43420d5a9c8f749e2da405..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
deleted file mode 100644
index 0aeb4f254639fdbf18e916dc219ca61602596d85..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
deleted file mode 100644
index c500eb01a90190ff66150871fe83ec275e2de8d7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
deleted file mode 100644
index c6336a9a69b95dc978719ce68896e3e752e67fed..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png
deleted file mode 100644
index b4ebbbe6a50f5fd7cd0cccb52cdac5653e34654c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png
deleted file mode 100644
index 88c60fe87f802c5ce560bb15bbdbd229aeafc4e4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
deleted file mode 100644
index 6270eefcfd7071bc1643ee06567e5b81aaf4c177..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst
index e1fd226116d88fbf137741242b304b367e598ba5..0fcb008e0a7773e81e5124da09fe07366130b924 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
@@ -6,7 +6,7 @@
.. todo::
概述
-
+
.. toctree::
:maxdepth: 2
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
@@ -0,0 +1,12 @@
+data/train.list
+data/test.*
+data/conll05st-release.tar.gz
+data/conll05st-release
+data/predicate_dict
+data/label_dict
+data/word_dict
+data/emb
+data/feature
+output
+predict.res
+train.log
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
index 47e948bd1ffc0ca692dc9899193e94831ce4234b..0891f5b6b16a1b715b44db6c47ba079adfcad4c5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。
-
+
图1. 依存句法分析句法树示例
@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
我们继续以上面的这句话为例,图1展示了BIO表示方法。
-
+
图2. BIO标注方法示例
@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
图3是最终得到的栈式循环神经网络结构示意图。
-
+
图3. 基于LSTM的栈式循环神经网络结构示意图
@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
-
+
图4. 基于LSTM的双向循环神经网络结构示意图
@@ -79,7 +79,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型
序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。
-
+
图5. 序列标注任务中使用的线性链条件随机场
@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注;
-
+
图6. SRL任务上的深层双向LSTM模型
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
deleted file mode 100644
index e63f5ebd6d00f2e4ecf97b9ab2027e74683013f2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
deleted file mode 100644
index f0a195c24d9ee493f96bb93c28a99e70566be7a4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
deleted file mode 100644
index e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
deleted file mode 100644
index 93b44dd4874402ef29ad7bd7d94147609b92e309..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
deleted file mode 100644
index 592f7ee23bdc88a9a35059612e5ab880bbc9d34b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
deleted file mode 100644
index c3646312e48db977402fb353dc0c9b4d02269bf4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
deleted file mode 100644
index 9265b671735940ed6549e2980064d2ce08baae64..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
deleted file mode 100644
index 23f4f45b603e3d60702af2b2464d10fc8deed061..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
deleted file mode 100644
index 0778fda74b2ad22ce4b631791a7b028cdef780a5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
deleted file mode 100644
index 3d2914c726b5f4c46e66dfa85d4e88649fede6b3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
deleted file mode 100644
index 0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
@@ -0,0 +1,9 @@
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
+multi-bleu.perl
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
index f37c559921483a3d7c619ed74903df56b0584bd5..fa2b930be0d26d816566599cece8afbedc1157e0 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
@@ -11,10 +11,10 @@
为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。
近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。
-![nmt](./image/nmt.png)
-
+
+
图1. 基于神经网络的机器翻译系统
-
+
本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。
@@ -45,18 +45,20 @@
具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。
-![bi_rnn](./image/bi_rnn.png)
-
-图3. 按时间步展开的双向循环神经网络
-
+
+
+
+图2. 按时间步展开的双向循环神经网络
+
### 编码器-解码器框架
编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
![encoder_decoder](./image/encoder_decoder.png)
-
-图4. 编码器-解码器框架
-
+
+
+图3. 编码器-解码器框架
+
#### 编码器
@@ -69,16 +71,14 @@
3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。
第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
-
-![encoder_attention](./image/encoder_attention.png)
-
-图5. 使用双向GRU的编码器
-
+
+
+图4. 使用双向GRU的编码器
+
#### 解码器
机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是:
-
1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下:
$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
@@ -100,7 +100,6 @@ $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。
使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是:
-
1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。
2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。
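
As a rough standalone illustration of the beam search procedure described above (toy per-step word distributions, assumed independent of the prefix for simplicity; not the tutorial's actual decoder), candidates are ranked by the sum of log-probabilities and only the top `beam_size` are kept at each step:

```python
import math

def beam_search(step_probs, beam_size=2):
    """step_probs: for each step, a dict mapping word -> probability."""
    beams = [([], 0.0)]  # (partial sequence, accumulated log-probability)
    for probs in step_probs:
        candidates = []
        for seq, score in beams:
            for word, p in probs.items():
                candidates.append((seq + [word], score + math.log(p)))
        # prune: keep only the beam_size highest-scoring candidates
        candidates.sort(key=lambda c: c[1], reverse=True)
        beams = candidates[:beam_size]
    return beams

print(beam_search([{"a": 0.6, "b": 0.4}, {"c": 0.7, "d": 0.3}]))
```
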
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
deleted file mode 100644
index 9d8efd50a49d0305586f550344472ab94c93bed3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
deleted file mode 100644
index 4b35c88fc8ea2c503473c0c15711744e784d6af6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
deleted file mode 100644
index 1b355e7786d25487a3f564af758c2c52c43b4690..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
deleted file mode 100644
index 3728f782ee09d9308d02b42305027b2735467ead..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
deleted file mode 100644
index 28d7a15a3bd65262bde22a3f41b5aa78b46b368a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
deleted file mode 100644
index ea8585565da1ecaf241654c278c6f9b15e283286..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
deleted file mode 100644
index 60aee0017de73f462e35708b1055aff8992c03e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
deleted file mode 100644
index 6b73798fe632e0873b35c117b86f347c8cf3116a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
deleted file mode 100644
index 0cde685b84106650a4df18ce335a23e6338d3d11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
deleted file mode 100644
index a6af429f23f0f7e82650139bbd8dcbef27a34abe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
deleted file mode 100644
index bf56d73ebf297fadf522389c7b6836dd379aa097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
deleted file mode 100644
index 557310e044b2b6687e5ea6895417ed946ac7bc11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.ipynb_checkpoints
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
index 0f7c97021f8ad463fc51ed169604b789ea068c3d..4b79e62f74e587fcd939d9f9e911af80992ea6a3 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
@@ -37,7 +37,7 @@ Prediction Score is 4.25
YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示:
-
+
图1. YouTube 推荐系统结构
@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐
首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
-
+
图2. 候选生成网络结构
@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解:
-
+
图3. 卷积神经网络文本分类模型
@@ -107,7 +107,7 @@ $$\hat c=max(c)$$
-
+
图4. 融合推荐模型
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
deleted file mode 100644
index c213608e769f69fb2cfe8597f8e696ee53730e3d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
deleted file mode 100644
index 8aedb2204371e7691140ceffa5992f6080bbf097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
deleted file mode 100644
index 4298567ac5600173343299999965b20612e7affe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
deleted file mode 100644
index a98e7cc67606b31e4c945f7eb907563e46dcef56..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
deleted file mode 100644
index 7fd97b9cc3a0b9105b41591af4e8f8e4646bd681..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
deleted file mode 100644
index 90c9b09fb78db98391ee199934f2d16efd6d6652..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
deleted file mode 100644
index 6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
deleted file mode 100644
index 61e63d9147cbc2901706ef80776d706e5368c3c5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
deleted file mode 100644
index fbcae2be81141be955076e877b94b0ea5d7e4d4a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
@@ -0,0 +1,10 @@
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
index 5844b6fe137c2401a04e47b5b489434ee9b363f1..9900dfb9a67dc6f8940bd7dd3abfa15ac8a3488f 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
@@ -37,7 +37,7 @@
循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
-
+
图1. 循环神经网络按时间展开的示意图
@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示:
-
+
图2. 时刻$t$的LSTM [7]
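
A compact numpy sketch of a single LSTM step consistent with the gate description above (toy sizes and random parameters; here the gates act on the concatenation of $x_t$ and $h_{t-1}$, and the peephole terms on $c_{t-1}$ are omitted for brevity):

```python
import numpy as np

rng = np.random.default_rng(1)
d = 4                                        # toy hidden size == input size
sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))

x_t, h_prev, c_prev = rng.normal(size=(3, d))
# one weight matrix and bias per gate, acting on [x_t, h_prev]
W = {g: rng.normal(size=(d, 2 * d)) for g in "ifco"}
b = {g: np.zeros(d) for g in "ifco"}
z = np.concatenate([x_t, h_prev])

i_t = sigmoid(W["i"] @ z + b["i"])           # input gate
f_t = sigmoid(W["f"] @ z + b["f"])           # forget gate
o_t = sigmoid(W["o"] @ z + b["o"])           # output gate
c_t = f_t * c_prev + i_t * np.tanh(W["c"] @ z + b["c"])   # memory cell
h_t = o_t * np.tanh(c_t)                     # hidden state: o_t ⊙ tanh(c_t)
print(h_t.shape)
```
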
@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。
-
+
图3. 栈式双向LSTM用于文本分类
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
deleted file mode 100644
index 98fbea413a98a619004ca669c67f5f867fe974c9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
deleted file mode 100644
index d73a00bf2c1fca2f9b8c26bccf5ea844fa1db50b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
deleted file mode 100644
index 26c904102a6e6c4e30f0048b81373ae8c148b355..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
deleted file mode 100644
index 6b2adf70f2b5112a2e82505da5cff9f5fd0c6298..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
deleted file mode 100644
index 8b5dbd726178b5555c513294e7b10a81acc96ff5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
@@ -0,0 +1,3 @@
+data/train.list
+data/test.list
+data/simple-examples*
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
index d21c7ddcc501f863b5ce672123dbbc6c26528f15..2c68cdac4f10319359b74bc92569dfd3f65380b5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
@@ -34,7 +34,7 @@ $$X = USV^T$$
本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。
-
+
图1. 词向量的二维投影
@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。
-
+
图2. N-gram神经网络模型
@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示:
-
+
图3. CBOW模型
@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。
-
+
图4. Skip-gram模型
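
As a toy numeric illustration of the CBOW context vector defined above (assumed 4-dimensional word vectors; not the tutorial's actual embedding layer):

```python
import numpy as np

# toy embeddings for the two words before and after w_t
x_prev2, x_prev1 = np.array([1., 0., 0., 0.]), np.array([0., 1., 0., 0.])
x_next1, x_next2 = np.array([0., 0., 1., 0.]), np.array([0., 0., 0., 1.])

# the CBOW context is simply the average of the surrounding word vectors
context = (x_prev1 + x_prev2 + x_next1 + x_next2) / 4
print(context)  # [0.25 0.25 0.25 0.25]
```
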
@@ -194,7 +194,7 @@ dream that one day
本配置的模型结构如下图所示:
-
+
图5. 模型配置中的N-gram神经网络模型
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
deleted file mode 100644
index 384f59919a2c8dedb198e97d51434616648932e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
deleted file mode 100644
index 76b7d4bc0f99372465bd9aa34721513d39ad0776..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
deleted file mode 100644
index d985c393e618e9b79df05e4ff0ae57ccc93744d0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
deleted file mode 100644
index 2e16ab2f443732b8ef5404a8e7cd2457bc5eee23..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
deleted file mode 100644
index 2449dce6a86b43b1b997ff418ed0dba56848463f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
deleted file mode 100644
index 1e0b40a8f7aefdf46d42761305511f281c08e595..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
deleted file mode 100644
index 158bd64b8f8729dea67834a8d591d21bce8b8564..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
deleted file mode 100644
index ce4a8bf4769183cbaff91793753d2350a3ce936c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
deleted file mode 100644
index a3ab385845d3dc8b5c670bae91225bc8dd47a8bb..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
deleted file mode 100644
index 3c36c6d1f66eb98ea78c0673965d02a4ee3aa288..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
index 27d25b43961ce74d73e391b735369501fb80a231..9574dbea2f9a39bb196b61bb4fd12ba7c378f75a 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
-
+
图1. 预测值 V.S. 真实值
@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
### 训练过程
定义好模型结构之后,我们要通过以下几个步骤进行模型训练
-
1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
-
2. 网络正向传播计算网络输出和损失函数。
-
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
-
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
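
A tiny numpy sketch of these four steps for the linear model $y = wx + b$ with the MSE loss above (toy data and plain gradient descent; not the tutorial's PaddlePaddle code):

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])
y = 2.0 * x + 1.0                          # ground truth: w = 2, b = 1

w, b = 0.0, 0.0                            # 1. initialize the parameters
lr = 0.05
for _ in range(500):                       # 4. repeat steps 2-3
    y_hat = w * x + b                      # 2. forward pass
    loss = np.mean((y_hat - y) ** 2)       #    MSE loss
    grad_w = np.mean(2 * (y_hat - y) * x)  # 3. backward pass: gradients of MSE
    grad_b = np.mean(2 * (y_hat - y))
    w -= lr * grad_w                       #    parameter update
    b -= lr * grad_b

print(w, b)  # converges towards 2.0 and 1.0
```
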
## 数据集
@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
-
+
图2. 各维属性的取值范围
@@ -199,10 +195,12 @@ step = 0
def event_handler_plot(event):
global step
if isinstance(event, fluid.EndStepEvent):
- if event.step % 10 == 0: # record the test cost every 10 seconds
+ if step % 10 == 0: # record a train cost every 10 batches
+ plot_cost.append(train_title, step, event.metrics[0])
+
+ if step % 100 == 0: # record a test cost every 100 batches
test_metrics = trainer.test(
reader=test_reader, feed_order=feed_order)
-
plot_cost.append(test_title, step, test_metrics[0])
plot_cost.plot()
@@ -210,12 +208,13 @@ def event_handler_plot(event):
# If the accuracy is good enough, we can stop the training.
print('loss is less than 10.0, stop')
trainer.stop()
-
- # We can save the trained parameters for the inferences later
- if params_dirname is not None:
- trainer.save_params(params_dirname)
-
step += 1
+
+ if isinstance(event, fluid.EndEpochEvent):
+ if event.epoch % 10 == 0:
+ # We can save the trained parameters for inference later
+ if params_dirname is not None:
+ trainer.save_params(params_dirname)
```
### 开始训练
@@ -231,11 +230,10 @@ trainer.train(
event_handler=event_handler_plot,
feed_order=feed_order)
```
-
-
-
- 图3. 训练结果
-
+
+
+图3. 训练结果
+
## 预测
@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
batch_size = 10
test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
test_data = test_reader().next()
-test_feat = numpy.array([data[0] for data in test_data]).astype("float32")
-test_label = numpy.array([data[1] for data in test_data]).astype("float32")
+test_x = numpy.array([data[0] for data in test_data]).astype("float32")
+test_y = numpy.array([data[1] for data in test_data]).astype("float32")
-results = inferencer.infer({'x': test_feat})
+results = inferencer.infer({'x': test_x})
print("infer results: (House Price)")
-for k in range(0, batch_size-1):
- print("%d. %f" % (k, results[0][k]))
+for idx, val in enumerate(results[0]):
+ print("%d: %.2f" % (idx, val))
print("\nground truth:")
-for k in range(0, batch_size-1):
- print("%d. %f" % (k, test_label[k]))
+for idx, val in enumerate(test_y):
+ print("%d: %.2f" % (idx, val))
```
## 总结
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
deleted file mode 100644
index 27e4acb1313794f52ad9ad9e874cdadd197ff41f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png
deleted file mode 100644
index f111c7cd766b7e9981513cc8c65be87dbbf3a79e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
deleted file mode 100644
index 5325df4800985983e17476f007658d1cdb170b1c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png
deleted file mode 100644
index 6d6a079bfdcc33617f6cf36612b271b48be6304f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png
deleted file mode 100644
index bcd304a6a0baf30ecfbc43e08fc0aca179d05958..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
index 3289116991cb8ebaa4a6fb78e100ce16f633d69c..e6f89b23a95d1a07565f3e0a285e9c3f921930df 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -6,8 +6,8 @@
当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
-
- 图1. MNIST图片示例
+
+图1. MNIST图片示例
MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。
@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下:
-$$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
+$$ L_{cross-entropy}(label, y) = -\sum_i label_i \log(y_i) $$
图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
-
+
图2. softmax回归网络结构图
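
A tiny numeric check of the cross-entropy above (assumed 3-class example; not part of the tutorial code):

```python
import numpy as np

label = np.array([0.0, 1.0, 0.0])   # one-hot ground-truth label
y = np.array([0.2, 0.7, 0.1])       # softmax output of the model
loss = -np.sum(label * np.log(y))   # only the true class contributes
print(loss)                         # -log(0.7) ≈ 0.357
```
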
@@ -54,16 +54,14 @@ $$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。
1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。
-
2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
-
3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。
图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
-
+
图3. 多层感知器网络结构图
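
A small numpy sketch of the two-hidden-layer computation listed above (toy layer sizes, random weights, and ReLU as the activation $\phi$; for illustration only):

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(784, 1))                     # one flattened 28x28 image
W1, b1 = rng.normal(size=(128, 784)), np.zeros((128, 1))
W2, b2 = rng.normal(size=(64, 128)), np.zeros((64, 1))
W3, b3 = rng.normal(size=(10, 64)), np.zeros((10, 1))

relu = lambda z: np.maximum(z, 0.0)
H1 = relu(W1 @ X + b1)                            # first hidden layer
H2 = relu(W2 @ H1 + b2)                           # second hidden layer
logits = W3 @ H2 + b3
Y = np.exp(logits - logits.max()) / np.sum(np.exp(logits - logits.max()))  # softmax
print(Y.ravel(), Y.sum())                         # 10 class probabilities summing to 1
```
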
@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
-
+
图4. LeNet-5卷积神经网络结构
@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单的讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像低级到复杂的特征。
-
+
图5. 卷积层图片
@@ -98,7 +96,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
#### 池化层
-
+
图6. 池化层图片
@@ -106,8 +104,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
-### 常见激活函数介绍
-
+### 常见激活函数介绍
- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
我们建议使用 Fluid API,因为它更容易学起来。
下面是快速的 Fluid API 概述。
-
1. `inference_program`:指定如何从数据输入中获得预测的函数。
这是指定网络流的地方。
-2. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
+1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
这是指定损失计算的地方。
-3. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
+1. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。
-4. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
+1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
通过 `event_handler` 回调函数,用户可以监控培训的进展。
-5. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
-
+1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
然后,它可以推断数据和返回预测。
在这个演示中,我们将深入了解它们。
@@ -240,6 +235,7 @@ def train_program():
acc = fluid.layers.accuracy(input=predict, label=label)
return [avg_cost, acc]
+
```
#### Optimizer Function 配置
@@ -255,9 +251,9 @@ def optimizer_program():
下一步,我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。
-下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
+下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
-`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader 。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
+`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
```python
train_reader = paddle.batch(
@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_program)
-
```
#### Event Handler 配置
@@ -315,11 +310,10 @@ def event_handler(event):
`event_handler_plot` 可以用来在训练过程中画图如下:
-
-
-
-图7. 训练结果
-
+
+
+图7. 训练结果
+
```python
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
deleted file mode 100644
index 3f5cdaacdc6acce41c5c6c99649be46685cf9903..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png
deleted file mode 100644
index bc1a9a4ccf81972dc0d69cf4c808a52218e14d61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
deleted file mode 100644
index 65bd17eacd41bbdbdb042bd1ba366eb53663b410..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png
deleted file mode 100644
index 77524754df906ab096e120bd657449f4565c3418..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
deleted file mode 100644
index c751892ba0be3ae803b5933c3f33487ecfb6fe7f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
deleted file mode 100644
index 030cd60d3b4af9aecd4941204da4ad15f6e1189f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
deleted file mode 100644
index 90b02fa2a735cfcc9efb2de90906325dedcb358c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png
deleted file mode 100644
index c626723512b6ee02abd55e5bab65e7629d130522..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
deleted file mode 100644
index 9f4d26cd8da32201d0a5e9c72d466301dd2b42a1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png
deleted file mode 100644
index 1fedea6a75abbf132cbbcf8ab10ce045997d697a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
deleted file mode 100644
index f5a478fdc24f29c17555a2f1451f3f5a079faed9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png
deleted file mode 100644
index 7d5508a1eccfcea1925f438043ee93b57769bebf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
deleted file mode 100644
index 4edd7cabf8a2282f6392ac1421c7ca4afb288589..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
deleted file mode 100644
index 40b98298288b9c406fce1cbca9c913753020a94d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png
deleted file mode 100644
index 833d3c663c94dd2d57fd19686949ded37a91f541..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
deleted file mode 100644
index 47204941af7f22e68386a70a06ec4f122b83e262..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png
deleted file mode 100644
index 6fa0a951d5262effb707e3e15af8cb900e5560b8..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png
deleted file mode 100644
index 5cb87b450d0398bcfaec0e647c362052069797e7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png and /dev/null differ
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
similarity index 100%
rename from doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
diff --git a/doc/fluid/new_docs/user_guides/howto/inference/index.rst b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..45e1a2883773b92ed47ef8d51417bbdcd060b4ec
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
@@ -0,0 +1,11 @@
+##############################
+Deploying Models for Inference
+##############################
+
+PaddlePaddle Fluid provides a C++ API to support deploying models to production.
+
+.. toctree::
+ :maxdepth: 2
+
+ build_and_install_lib_cn.rst
+ native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
similarity index 94%
rename from doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
index aa9377c112856693cda72779bd399f2415d716f0..21a6fe5cf54d0c0c760ade4ba602024ffa29675f 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
@@ -4,11 +4,13 @@ Paddle 预测 API
To make inference deployment simpler and more convenient, Fluid provides a set of high-level APIs
that hide the different underlying optimization implementations.
-`Inference library code `__
+`Inference library code `__
includes
- the header file ``paddle_inference_api.h``, which defines all the interfaces
- the library file ``libpaddle_fluid.so`` or ``libpaddle_fluid.a``
+- the library file ``libpaddle_inference_api.so`` or
+  ``libpaddle_inference_api.a``
For compilation and dependencies, see :ref:`install_or_build_cpp_inference_lib`.
@@ -95,7 +97,8 @@ engine
CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ...
-When compiling, linking against ``libpaddle_fluid.a/.so`` is sufficient.
+When compiling, linking against ``libpaddle_fluid.a/.so`` together with
+``libpaddle_inference_api.a/.so`` is sufficient.
Detailed code reference
-----------------------
diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst
index 453cb71cfdf72e031ce0f0517e2db936eca38dfc..377631109d8f65c149b12cd2a0e4da920fdf4def 100644
--- a/doc/fluid/new_docs/user_guides/index.rst
+++ b/doc/fluid/new_docs/user_guides/index.rst
@@ -15,4 +15,5 @@
howto/training/index
howto/debug/index
howto/evaluation/index
+ howto/inference/index
models/index.rst
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index ed4e67879c795258683b094cfaeaff9063d66848..b6ae930b7155d15d24b287cc3eed50f2aeaa5599 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -170,6 +170,7 @@ paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], vara
paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None))
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
+paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
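The newly listed `pad2d` entry can be exercised from Python roughly as follows; the argument names and defaults come from the ArgSpec line above, while the concrete tensor shape is an illustrative assumption.

```python
import paddle.fluid as fluid

# Build a small program that pads every 32x32 feature map;
# paddings are given as [top, bottom, left, right] in 'NCHW' layout.
img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
padded = fluid.layers.pad2d(
    input=img,
    paddings=[1, 1, 2, 2],
    mode='constant',
    pad_value=0.0,
    data_format='NCHW')
# The padded output has shape [-1, 3, 34, 36].
```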
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index cd00b7de7338982308acfa1f1e8c38e010c6a43b..c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -46,7 +46,7 @@ struct CastDataLayout {
const std::vector axis_;
template
- void operator()() {
+ void apply() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 84691a2059124960a3213802fec0863f8abe6df7..8ad2fb5f3ffd9641932bbbb024a31e81d31dc9bb 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -26,75 +26,40 @@ namespace framework {
extern proto::VarType::Type ToDataType(std::type_index type);
extern std::type_index ToTypeIndex(proto::VarType::Type type);
-#if !defined(_WIN32)
template
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
switch (type) {
case proto::VarType::FP16:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::FP32:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::FP64:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::INT32:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::INT64:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::BOOL:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::UINT8:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::INT16:
- visitor.template operator()();
+ visitor.template apply();
break;
case proto::VarType::INT8:
- visitor.template operator()();
+ visitor.template apply();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
}
-#else
-// the msvc compiler do not implement two-stage name lookup correctly.
-template
-inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
- switch (type) {
- case proto::VarType::FP16:
- visitor.operator()();
- break;
- case proto::VarType::FP32:
- visitor.operator()();
- break;
- case proto::VarType::FP64:
- visitor.operator()();
- break;
- case proto::VarType::INT32:
- visitor.operator()();
- break;
- case proto::VarType::INT64:
- visitor.operator()();
- break;
- case proto::VarType::BOOL:
- visitor.operator()();
- break;
- case proto::VarType::UINT8:
- visitor.operator()();
- break;
- case proto::VarType::INT16:
- visitor.operator()();
- break;
- default:
- PADDLE_THROW("Not supported %d", type);
- }
-}
-#endif // _WIN32
extern std::string DataTypeToString(const proto::VarType::Type type);
extern size_t SizeOfType(std::type_index type);
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 5a57ec20585c26dbcd4251464718fc819148a7a5..d79f8cacb5f4727defc77380371e57bcea65f068 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -37,7 +37,7 @@ struct CastDataType {
const platform::DeviceContext* ctx_;
template
- void operator()() {
+ void apply() {
auto* in_begin = in_.data();
auto* in_end = in_begin + in_.numel();
auto* out_begin = out_->mutable_data(in_.place());
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index e28264eb32756f77ef5baed3dff77ba9f0943160..bd6153c0c736f6e32378eebcbf6c4d7e402c9b42 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -31,7 +31,7 @@ struct ReduceLoDTensor {
: src_tensors_(src), dst_tensor_(*dst) {}
template
- void operator()() const {
+ void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index d2d051a69a33a38535e67227d4cc62f5b35e430c..0278ade6763ec614701674691797d766878a378e 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,6 +13,9 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+
+#include
+
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -216,11 +219,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
float* out_data = out->mutable_data(platform::CPUPlace());
std::array tensors(
- {W_forget_w0.data(), W_input_w0.data(),
- W_output_w0.data(), W_cell_w0.data()});
+ {{W_forget_w0.data(), W_input_w0.data(),
+ W_output_w0.data(), W_cell_w0.data()}});
std::array tensors1(
- {W_forget_w1.data(), W_input_w1.data(),
- W_output_w1.data(), W_cell_w1.data()});
+ {{W_forget_w1.data(), W_input_w1.data(),
+ W_output_w1.data(), W_cell_w1.data()}});
for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) {
@@ -243,8 +246,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) {
std::array tensors(
- {B_forget.data(), B_input.data(), B_output.data(),
- B_cell.data()});
+ {{B_forget.data(), B_input.data(), B_output.data(),
+ B_cell.data()}});
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0];
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 024397067c50fd2d82c6efd07a250fef4d9a2187..7bed1570507f97c8f84da5b6138ff49a2912dc91 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -13,38 +13,37 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
-std::unique_ptr FCLstmFusePass::ApplyImpl(
- std::unique_ptr graph) const {
- GraphPatternDetector gpd;
- auto* pattern = gpd.mutable_pattern();
-
- std::unordered_set fused_ops({// first lstm
- 13, 15, 16,
- // second lstm
- 23, 25, 26});
+std::string GenNodeName(const std::string& prefix, const std::string& name) {
+ return prefix + "/" + name;
+}
- pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
- "any_node");
+void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+ bool with_fc_bias) {
+ PDNode* x = pattern->NewNode(name_scope, "x")
+ ->assert_is_op_input("mul")
+ ->assert_var_not_persistable();
+ auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
+ fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
+ patterns::LSTM(pattern, name_scope, fc_out);
+ // LOG(INFO) << "\n" << pattern->DotString();
+}
- std::unordered_set marked_nodes;
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
+ bool with_fc_bias) {
+ GraphPatternDetector gpd;
+ auto* pattern = gpd.mutable_pattern();
- auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
- Graph* g) {
- auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
- marked_nodes.insert(id);
- };
- gpd(graph.get(), handler);
+ BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
- int bias, int hidden, int cell, int xx) {
+ int bias, int hidden, int cell, int xx, int fc_bias) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(input);
GET_NODE(weight_x);
@@ -62,12 +61,33 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
SET_IN(Bias, bias);
-#undef GET_NODE
#undef SET_IN
+ if (with_fc_bias) {
+ // Add FC-bias with LSTM-bias and create a new weight
+ PADDLE_ENFORCE(scope);
+ const std::string& new_bias_var = name_scope + "_bias.new";
+ auto* bias_var = scope->Var(new_bias_var);
+ PADDLE_ENFORCE(bias_var);
+ auto* bias_tensor = bias_var->GetMutable();
+ auto* lstm_bias_var = scope->FindVar(bias_n->Name());
+ PADDLE_ENFORCE(lstm_bias_var);
+ const auto& lstm_bias_tensor = lstm_bias_var->Get();
+ bias_tensor->Resize(lstm_bias_tensor.dims());
+
+ GET_NODE(fc_bias);
+ auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+ const auto& fc_bias_tensor = fc_bias_var->Get();
+
+ auto* data = bias_tensor->mutable_data(platform::CPUPlace());
+
+ for (int i = 0; i < bias_tensor->numel(); i++) {
+ data[i] =
+ fc_bias_tensor.data()[i] + lstm_bias_tensor.data()[i];
+ }
+ op_desc.SetInput("Bias", {new_bias_var});
+ }
- VLOG(4) << "hidden_n: " << hidden_n->Name();
- VLOG(4) << "cell: " << cell_n->Name();
- VLOG(4) << "xx: " << xx_n->Name();
+#undef GET_NODE
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
@@ -76,7 +96,7 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
op_desc.SetOutput("XX", {xx_n->Name()});
op_desc.SetOutput("BatchedInput", {"blstm_0.tmp_2"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
- op_desc.SetAttr("use_peepholes", false);
+ op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
#define TMP_NAME(x) "at.new.tmp." #x
#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
@@ -85,8 +105,8 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
OP_SET_OUT(ReorderedH0);
OP_SET_OUT(ReorderedC0);
#undef OP_SET_OUT
- auto* op = graph->CreateOpNode(&op_desc);
+ auto* op = graph->CreateOpNode(&op_desc);
PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
auto* scope = graph->Get(kParamScopeAttr);
@@ -95,7 +115,6 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
TMP_NEW(BatchedHidden);
TMP_NEW(ReorderedH0);
TMP_NEW(ReorderedC0);
-
#undef TMP_NEW
#undef TMP_NAME
@@ -111,32 +130,69 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
return op;
};
- lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
- lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
+ int fusion_count{0};
- // remove all the nodes
+ auto fc_no_bias_handler = [&](
+ const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
- for (auto* node : marked_nodes) {
- graph->RemoveNode(const_cast(node));
- }
+#define GET_NODE(name__) \
+ std::string name__##key = name_scope + "/" + #name__; \
+ auto* name__##n = pattern->RetrieveNode(name__##key); \
+ PADDLE_ENFORCE(name__##n); \
+ PADDLE_ENFORCE(subgraph.count(name__##n)); \
+ Node* name__##_n = subgraph.at(name__##n); \
+ int name__ __attribute__((unused)) = name__##_n->id();
- for (auto* node : graph->Nodes()) {
- for (auto it = node->inputs.begin(); it != node->inputs.end();) {
- if (marked_nodes.count(*it)) {
- it = const_cast(node)->inputs.erase(it);
- } else {
- it++;
- }
- }
- for (auto it = node->outputs.begin(); it != node->outputs.end();) {
- if (marked_nodes.count(*it)) {
- it = const_cast(node)->outputs.erase(it);
- } else {
- it++;
- }
+ GET_NODE(x);
+ GET_NODE(w);
+ GET_NODE(mul);
+ GET_NODE(fc_out);
+ GET_NODE(Weight);
+ GET_NODE(lstm);
+ GET_NODE(Bias);
+ GET_NODE(Hidden);
+ GET_NODE(Cell);
+
+ if (with_fc_bias) {
+ GET_NODE(fc_bias);
+ lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+ } else {
+ lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
}
- }
+#undef GET_NODE
+
+ // Remove unneeded nodes.
+ std::unordered_set marked_nodes({mul_n, lstm_n});
+
+ GraphSafeRemoveNodes(graph, marked_nodes);
+
+ ++fusion_count;
+ };
+
+ gpd(graph, fc_no_bias_handler);
+
+ return fusion_count;
+}
+
+std::unique_ptr MulLstmFusePass::ApplyImpl(
+ std::unique_ptr graph) const {
+ FusePassBase::Init(name_scope_, graph.get());
+
+ int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+ false /*with_fc_bias*/);
+
+ AddStatis(fusion_count);
+ return graph;
+}
+
+std::unique_ptr FCLstmFusePass::ApplyImpl(
+ std::unique_ptr graph) const {
+ FusePassBase::Init(name_scope_, graph.get());
+
+ int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+ true /*with_fc_bias*/);
+ AddStatis(fusion_count);
return graph;
}
@@ -144,4 +200,5 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
} // namespace framework
} // namespace paddle
+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 74b08ae558b12c9328db58687cd01edbc37291a8..5a6687872eb3ab4a032227fda9ff0e7f5254670b 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -12,20 +12,34 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
-class FCLstmFusePass : public Pass {
+// The FCLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op.
+
+// Just FC without bias
+class FCLstmFusePass : public FusePassBase {
public:
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+
+ const std::string name_scope_{"fc_lstm_fuse"};
+};
+
+class MulLstmFusePass : public FusePassBase {
+ public:
+ virtual ~MulLstmFusePass() {}
+
+ protected:
+ std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 55e495a0ed75c3a09703438dcfe01ca8f9d36118..ae8496204d4aeb88c04154d571325d440274e821 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -167,7 +167,6 @@ class Graph {
std::map> attr_dels_;
std::map> nodes_;
std::unordered_set node_set_;
- int node_count_{0};
};
bool IsControlDepVar(const ir::Node &var);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 945ab110b148c320b6626cadaa47d483df68419e..f651ab635eadc9f248964e91dceebf3aa9c42926 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@@ -71,7 +72,11 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) {
- if (!MarkPDNodesInGraph(*graph)) return;
+ if (!MarkPDNodesInGraph(*graph)) {
+ LOG(INFO) << "Mark failed";
+ return;
+ }
+
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
@@ -87,7 +92,7 @@ void GraphPatternDetector::operator()(Graph* graph,
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
- VLOG(4) << "mark pdnodes in graph";
+ VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
@@ -107,6 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+
return !pdnodes2nodes_.empty();
}
@@ -357,7 +363,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
assert_is_op_input(op_type);
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->outputs) {
- if (IsNthInput(x, op, argument, nth)) return true;
+ if (op->IsOp() && op->Op()->Type() == op_type &&
+ IsNthInput(x, op, argument, nth))
+ return true;
}
return false;
});
@@ -368,7 +376,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
assert_is_var();
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->inputs) {
- if (IsNthOutput(x, op, argument, nth)) return true;
+ if (op->IsOp() && op->Op()->Type() == op_type &&
+ IsNthOutput(x, op, argument, nth))
+ return true;
}
return false;
});
@@ -412,6 +422,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
});
return this;
}
+PDNode* PDNode::assert_is_op_output(const std::string& op_type,
+ const std::string& argument) {
+ assert_is_var();
+ assert_is_op_nth_output(op_type, argument, 0);
+ return this;
+}
PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
assert_is_var();
asserts_.emplace_back([=](Node* x) {
@@ -424,6 +440,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
});
return this;
}
+PDNode* PDNode::assert_is_op_input(const std::string& op_type,
+ const std::string& argument) {
+ assert_is_var();
+ assert_is_op_nth_input(op_type, argument, 0);
+ return this;
+}
PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
@@ -439,6 +461,128 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
return this;
}
+bool VarLinksToOp(Node* node, const std::string& op_type) {
+ for (auto* out : node->outputs) {
+ if (out->IsOp() && out->Op()->Type() == op_type) {
+ return true;
+ }
+ }
+ return false;
+}
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
+ PADDLE_ENFORCE(var->IsVar());
+ PADDLE_ENFORCE(op->IsOp());
+ if (op->Op()->Input(argument).size() <= nth) return false;
+ return var->Name() == op->Op()->Input(argument)[nth];
+}
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
+ PADDLE_ENFORCE(var->IsVar());
+ PADDLE_ENFORCE(op->IsOp());
+ if (op->Op()->Output(argument).size() <= nth) return false;
+ return var->Name() == op->Op()->Output(argument)[nth];
+}
+void GraphSafeRemoveNodes(Graph* graph,
+ const std::unordered_set& nodes) {
+ for (auto* node : nodes) {
+ graph->RemoveNode(const_cast(node));
+ }
+
+ for (auto* node : graph->Nodes()) {
+ for (auto it = node->inputs.begin(); it != node->inputs.end();) {
+ if (nodes.count(*it)) {
+ it = const_cast(node)->inputs.erase(it);
+ } else
+ it++;
+ }
+ for (auto it = node->outputs.begin(); it != node->outputs.end();) {
+ if (nodes.count(*it)) {
+ it = const_cast(node)->outputs.erase(it);
+ } else
+ it++;
+ }
+ }
+}
+bool VarLinksFromOp(Node* node, const std::string& op_type) {
+ for (auto* out : node->inputs) {
+ if (out->IsOp() && out->Op()->Type() == op_type) {
+ return true;
+ }
+ }
+ return false;
+}
+
+PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
+ PDNode* x, bool with_bias) {
+ // Create Operators
+ PDNode* elementwise_add_op{nullptr};
+ auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
+ if (with_bias) {
+ elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
+ ->assert_is_op("elementwise_add");
+ }
+ // Create variables
+ // w
+ auto* mul_weight_var = pattern->NewNode(name_scope, "w")
+ ->AsInput()
+ ->assert_is_persistable_var()
+ ->assert_is_op_nth_input("mul", "Y", 0);
+ PDNode* mul_out_var{nullptr};
+ if (with_bias) {
+ // intermediate variable, will be removed in the IR after fuse.
+ mul_out_var = pattern->NewNode(name_scope, "mul_out")
+ ->AsIntermediate()
+ ->assert_is_only_output_of_op("mul")
+ ->assert_is_op_input("elementwise_add");
+ }
+ PDNode *bias{nullptr}, *fc_out{nullptr};
+ if (with_bias) {
+ // bias
+ bias = pattern->NewNode(name_scope, "fc_bias")
+ ->assert_is_op_input("elementwise_add")
+ ->AsInput();
+ // output
+ fc_out = pattern->NewNode(name_scope, "fc_out")
+ ->AsOutput()
+ ->assert_is_op_output("elementwise_add");
+ } else {
+ fc_out = pattern->NewNode(name_scope, "fc_out")
+ ->AsOutput()
+ ->assert_is_op_output("mul");
+ }
+
+ if (with_bias) {
+ mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
+ elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+ } else {
+ mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
+ }
+
+ return fc_out;
+}
+PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
+ PDNode* x) {
+ x->assert_is_op_input("lstm", "Input");
+ auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
+#define NEW_NODE(arg__, io__) \
+ auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+ ->assert_is_op_##io__("lstm", #arg__);
+
+ // Currently, the H0 and C0 are optional
+ // TODO(Superjomn) upgrade the fuse framework to support optional.
+ // NEW_NODE(H0, input);
+ // NEW_NODE(C0, input);
+ NEW_NODE(Weight, input);
+ NEW_NODE(Bias, input);
+
+ NEW_NODE(Hidden, output);
+ NEW_NODE(Cell, output);
+ NEW_NODE(BatchGate, output);
+ NEW_NODE(BatchCellPreAct, output);
+
+ lstm_op->LinksFrom({x, Weight, Bias});
+ lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
+ return Hidden;
+}
} // namespace ir
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index f8488c84962d1caa6e7817b3c0349d6da3a59182..024ce8ce55616cc5e0eaced4a27a6e1fb004af2c 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -95,7 +95,11 @@ struct PDNode {
PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var();
PDNode* assert_is_op_output(const std::string& op_type);
+ PDNode* assert_is_op_output(const std::string& op_type,
+ const std::string& argument);
PDNode* assert_is_op_input(const std::string& op_type);
+ PDNode* assert_is_op_input(const std::string& op_type,
+ const std::string& argument);
PDNode* assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth);
PDNode* assert_is_op_nth_output(const std::string& op_type,
@@ -167,6 +171,9 @@ class PDPattern {
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
PDNode* NewNode(const std::string& name = NewID());
+ PDNode* NewNode(const std::string& prefix, const std::string& name) {
+ return NewNode(prefix + "/" + name);
+ }
PDNode* RetrieveNode(const std::string& id) const;
const std::vector>& nodes() const { return nodes_; }
@@ -257,64 +264,36 @@ class GraphPatternDetector {
// some helper methods.
-// Op's input.
-static bool VarLinksToOp(Node* node, const std::string& op_type) {
- for (auto* out : node->outputs) {
- if (out->IsOp() && out->Op()->Type() == op_type) {
- return true;
- }
- }
- return false;
-}
-
-// Op's output.
-static bool VarLinksFromOp(Node* node, const std::string& op_type) {
- for (auto* out : node->inputs) {
- if (out->IsOp() && out->Op()->Type() == op_type) {
- return true;
- }
- }
- return false;
-}
+// Tell if a var links to an Op
+bool VarLinksToOp(Node* node, const std::string& op_type);
+
+// Tell if an op links to a var
+bool VarLinksFromOp(Node* node, const std::string& op_type);
// Check whether a var node is a op node's nth input.
-static bool IsNthInput(Node* var, Node* op, const std::string& argument,
- size_t nth) {
- PADDLE_ENFORCE(var->IsVar());
- PADDLE_ENFORCE(op->IsOp());
- if (op->inputs.size() <= nth) return false;
- return var->Name() == op->Op()->Input(argument)[nth];
-}
-
-static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
- size_t nth) {
- PADDLE_ENFORCE(var->IsVar());
- PADDLE_ENFORCE(op->IsOp());
- if (op->inputs.size() <= nth) return false;
- return var->Name() == op->Op()->Output(argument)[nth];
-}
-
-static void GraphSafeRemoveNodes(Graph* graph,
- const std::unordered_set& nodes) {
- for (auto* node : nodes) {
- graph->RemoveNode(const_cast(node));
- }
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
- for (auto* node : graph->Nodes()) {
- for (auto it = node->inputs.begin(); it != node->inputs.end();) {
- if (nodes.count(*it)) {
- it = const_cast(node)->inputs.erase(it);
- } else
- it++;
- }
- for (auto it = node->outputs.begin(); it != node->outputs.end();) {
- if (nodes.count(*it)) {
- it = const_cast(node)->outputs.erase(it);
- } else
- it++;
- }
- }
-}
+// Tell whether a var node is an op node's nth output.
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
+
+// Safely remove some nodes from the graph; their edges are cleaned up automatically.
+void GraphSafeRemoveNodes(Graph* graph,
+ const std::unordered_set& nodes);
+
+// Some pre-defined patterns that can be reused in multiple passes.
+namespace patterns {
+
+// FC with bias
+// op: mul + elementwise_add
+// named nodes:
+// mul, elementwise_add
+// w, mul_out, bias, fc_out
+PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
+ bool with_bias);
+
+PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
+} // namespace patterns
} // namespace ir
} // namespace framework
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 8d885cb9e4ee6e01de386b0f22423988dbe60ca6..e64916a5bb662e3b00cfe212f0bbbc537c7bc2cc 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
};
+static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
+ if (!graph->Has(kGraphvizMarkedNodeAttr)) {
+ graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
+ }
+ return graph->Get(kGraphvizMarkedNodeAttr);
+}
+
} // namespace ir
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index a4319ffabb04f39437b76d97845e021ef9de66d3..8c290bb095d554a973e66a3a19606a06759fd668 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -49,7 +49,7 @@ struct TensorCopyVisitor {
size_(size) {}
template
- void operator()() const {
+ void apply() const {
// TODO(Yancey1989): support other place
platform::CPUPlace cpu;
memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu,
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index ab693004cfb038fd92afd9c60e0fcb4e16b9f8a9..05c4a17a01c6fabe48f3fe18544c13153feb0673 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -149,7 +149,7 @@ struct AnyDTypeVisitor {
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template
- void operator()() const {
+ void apply() const {
auto t = EigenVector::Flatten(tensor_);
auto o = EigenScalar::From(*out_);
// return any of predicate_(t) is true.
@@ -302,7 +302,7 @@ struct DeserializedDataFunctor {
: buf_(buf), tensor_(tensor), place_(place) {}
template
- void operator()() {
+ void apply() {
*buf_ = tensor_->mutable_data(place_);
}
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index a4f6364ae5b7d832096c92e9c6d8b3e865713cff..86392078b356df774fbc47aed9214e9f10fe33be 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -17,9 +17,9 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-if(NOT APPLE)
+#if(APPLE)
add_subdirectory(api)
-endif()
+#endif()
# Create static library
cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 779ede5e460d0ceb6fd404c4a32374f9f9d92088..cc0dd0d492d42e9552c9ce081e268330599104f0 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -25,9 +25,8 @@ function (inference_analysis_test TARGET)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS EXTRA_DEPS)
+ set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
set(mem_opt "")
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
@@ -35,28 +34,25 @@ function (inference_analysis_test TARGET)
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
- ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
+ ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
-set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
-set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
-set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
-set(DITU_RNN_MODEL ${DITU_INSTALL_DIR}/model)
-set(DITU_RNN_DATA ${DITU_INSTALL_DIR}/data.txt)
-
-function (inference_download_and_uncompress target url gz_filename)
+function (inference_download_and_uncompress install_dir url gz_filename)
message(STATUS "Download inference test stuff ${gz_filename} from ${url}")
- execute_process(COMMAND bash -c "mkdir -p ${DITU_INSTALL_DIR}")
- execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && wget -q ${url}")
- execute_process(COMMAND bash -c "cd ${DITU_INSTALL_DIR} && tar xzf ${gz_filename}")
+ execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
+ execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
+ execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${gz_filename}")
message(STATUS "finish downloading ${gz_filename}")
endfunction(inference_download_and_uncompress)
-if (NOT EXISTS ${DITU_INSTALL_DIR})
- inference_download_and_uncompress(ditu_rnn_model ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
- inference_download_and_uncompress(ditu_rnn_data ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
+set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
+set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
+set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
+if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
+ inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
+ inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
@@ -73,8 +69,7 @@ inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
attention_lstm_fuse_pass
paddle_inference_api
pass
- ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
- --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
+ ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
--infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
@@ -87,3 +82,29 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_
inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
+
+set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
+set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
+set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
+if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING)
+ inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
+ inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
+ EXTRA_DEPS paddle_inference_api paddle_fluid_api
+ ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
+ --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+
+set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
+set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
+set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
+if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING)
+ inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
+ inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
+ EXTRA_DEPS paddle_inference_api paddle_fluid_api
+ ARGS --infer_model=${LAC_INSTALL_DIR}/model
+ --infer_data=${LAC_INSTALL_DIR}/data.txt)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index e6e63544ffa2de09e39b02769aaaf0793d6b1111..192ac2daa6a78efec6db19870f71e80593c62da9 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -109,6 +109,7 @@ void Analyzer::Run(Argument* argument) {
"infer_clean_graph_pass", "graph_viz_pass", //
"attention_lstm_fuse_pass", "graph_viz_pass", //
"fc_lstm_fuse_pass", "graph_viz_pass", //
+ "mul_lstm_fuse_pass", "graph_viz_pass", //
"seq_concat_fc_fuse_pass", "graph_viz_pass", //
"fc_fuse_pass", "graph_viz_pass" //
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2f7253ac04cac8457fa60a055e4ef2770aa874b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include
+#include
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path for LAC");
+DEFINE_string(infer_data, "", "data file for LAC");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct DataRecord {
+ std::vector data;
+ std::vector lod;
+ // for dataset and nextbatch
+ size_t batch_iter{0};
+ std::vector> batched_lods;
+ std::vector> batched_datas;
+ std::vector> datasets;
+ DataRecord() = default;
+ explicit DataRecord(const std::string &path, int batch_size = 1) {
+ Load(path);
+ Prepare(batch_size);
+ batch_iter = 0;
+ }
+ void Load(const std::string &path) {
+ std::ifstream file(path);
+ std::string line;
+ int num_lines = 0;
+ datasets.resize(0);
+ while (std::getline(file, line)) {
+ num_lines++;
+ std::vector data;
+ split(line, ';', &data);
+ std::vector words_ids;
+ split_to_int64(data[1], ' ', &words_ids);
+ datasets.emplace_back(words_ids);
+ }
+ }
+ void Prepare(int bs) {
+ if (bs == 1) {
+ batched_datas = datasets;
+ for (auto one_sentence : datasets) {
+ batched_lods.push_back({0, one_sentence.size()});
+ }
+ } else {
+ std::vector one_batch;
+ std::vector lod{0};
+ int bs_id = 0;
+ for (auto one_sentence : datasets) {
+ bs_id++;
+ one_batch.insert(one_batch.end(), one_sentence.begin(),
+ one_sentence.end());
+ lod.push_back(lod.back() + one_sentence.size());
+ if (bs_id == bs) {
+ bs_id = 0;
+ batched_datas.push_back(one_batch);
+ batched_lods.push_back(lod);
+ one_batch.clear();
+ one_batch.resize(0);
+ lod.clear();
+ lod.resize(0);
+ lod.push_back(0);
+ }
+ }
+ if (one_batch.size() != 0) {
+ batched_datas.push_back(one_batch);
+ batched_lods.push_back(lod);
+ }
+ }
+ }
+ DataRecord NextBatch() {
+ DataRecord data;
+ data.data = batched_datas[batch_iter];
+ data.lod = batched_lods[batch_iter];
+ batch_iter++;
+ if (batch_iter >= batched_datas.size()) {
+ batch_iter = 0;
+ }
+ return data;
+ }
+};
+void GetOneBatch(std::vector *input_slots, DataRecord *data,
+ int batch_size) {
+ auto one_batch = data->NextBatch();
+ PaddleTensor input_tensor;
+ input_tensor.name = "word";
+ input_tensor.shape.assign({static_cast(one_batch.data.size()), 1});
+ input_tensor.lod.assign({one_batch.lod});
+ input_tensor.dtype = PaddleDType::INT64;
+ TensorAssignData(&input_tensor, {one_batch.data});
+ PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1));
+ input_slots->assign({input_tensor});
+}
+static void PrintTime(const double latency, const int bs, const int repeat) {
+ LOG(INFO) << "===========profile result===========";
+ LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
+ << ", avg latency: " << latency / repeat << "ms";
+ LOG(INFO) << "=====================================";
+}
+void BenchAllData(const std::string &model_path, const std::string &data_file,
+ const int batch_size, const int repeat) {
+ NativeConfig config;
+ config.model_dir = model_path;
+ config.use_gpu = false;
+ config.device = 0;
+ config.specify_input_name = true;
+ std::vector input_slots, outputs_slots;
+ DataRecord data(data_file, batch_size);
+ auto predictor =
+ CreatePaddlePredictor(config);
+ GetOneBatch(&input_slots, &data, batch_size);
+ for (int i = 0; i < FLAGS_burning; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ Timer timer;
+ double sum = 0;
+ for (int i = 0; i < repeat; i++) {
+ for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+ GetOneBatch(&input_slots, &data, batch_size);
+ timer.tic();
+ predictor->Run(input_slots, &outputs_slots);
+ sum += timer.toc();
+ }
+ }
+ PrintTime(sum, batch_size, repeat);
+}
+const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
+ 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
+ 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
+ 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+void TestLACPrediction(const std::string &model_path,
+ const std::string &data_file, const int batch_size,
+ const int repeat, bool test_all_data) {
+ if (test_all_data) {
+ BenchAllData(model_path, data_file, batch_size, repeat);
+ return;
+ }
+ NativeConfig config;
+ config.model_dir = model_path;
+ config.use_gpu = false;
+ config.device = 0;
+ config.specify_input_name = true;
+ std::vector input_slots, outputs_slots;
+ DataRecord data(data_file, batch_size);
+ GetOneBatch(&input_slots, &data, batch_size);
+ auto predictor =
+ CreatePaddlePredictor(config);
+ for (int i = 0; i < FLAGS_burning; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ Timer timer;
+ timer.tic();
+ for (int i = 0; i < repeat; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ PrintTime(timer.toc(), batch_size, repeat);
+ EXPECT_EQ(outputs_slots.size(), 1UL);
+ auto &out = outputs_slots[0];
+ size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+ [](int a, int b) { return a * b; });
+ size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+ PADDLE_ENFORCE_GT(size, 0);
+ EXPECT_GE(size, batch1_size);
+ int64_t *pdata = static_cast(out.data.data());
+ for (size_t i = 0; i < batch1_size; ++i) {
+ EXPECT_EQ(pdata[i], lac_ref_data[i]);
+ }
+}
+TEST(Analyzer_LAC, native) {
+ LOG(INFO) << "LAC with native";
+ TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+ FLAGS_repeat, FLAGS_test_all_data);
+}
+} // namespace analysis
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..720a8811db75a91a5774a29dd95285eceabadf83
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include
+#include
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 10, "batch size.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+
+namespace paddle {
+namespace inference {
+
+struct DataRecord {
+ std::vector> word_data_all, mention_data_all;
+ std::vector> rnn_word_datas, rnn_mention_datas;
+ std::vector lod; // two inputs have the same lod info.
+ size_t batch_iter{0};
+ size_t batch_size{1};
+ DataRecord() = default;
+ explicit DataRecord(const std::string &path, int batch_size = 1)
+ : batch_size(batch_size) {
+ Load(path);
+ }
+ DataRecord NextBatch() {
+ DataRecord data;
+ size_t batch_end = batch_iter + batch_size;
+ // NOTE: skip the final batch if not enough data is provided.
+ if (batch_end <= word_data_all.size()) {
+ data.word_data_all.assign(word_data_all.begin() + batch_iter,
+ word_data_all.begin() + batch_end);
+ data.mention_data_all.assign(mention_data_all.begin() + batch_iter,
+ mention_data_all.begin() + batch_end);
+ // Prepare LoDs
+ data.lod.push_back(0);
+ CHECK(!data.word_data_all.empty());
+ CHECK(!data.mention_data_all.empty());
+ CHECK_EQ(data.word_data_all.size(), data.mention_data_all.size());
+ for (size_t j = 0; j < data.word_data_all.size(); j++) {
+ data.rnn_word_datas.push_back(data.word_data_all[j]);
+ data.rnn_mention_datas.push_back(data.mention_data_all[j]);
+ // calculate lod
+ data.lod.push_back(data.lod.back() + data.word_data_all[j].size());
+ }
+ }
+ batch_iter += batch_size;
+ return data;
+ }
+ void Load(const std::string &path) {
+ std::ifstream file(path);
+ std::string line;
+ int num_lines = 0;
+ while (std::getline(file, line)) {
+ num_lines++;
+ std::vector data;
+ split(line, ';', &data);
+ // load word data
+ std::vector word_data;
+ split_to_int64(data[1], ' ', &word_data);
+ // load mention data
+ std::vector mention_data;
+ split_to_int64(data[3], ' ', &mention_data);
+ word_data_all.push_back(std::move(word_data));
+ mention_data_all.push_back(std::move(mention_data));
+ }
+ }
+};
+
+void PrepareInputs(std::vector *input_slots, DataRecord *data,
+ int batch_size) {
+ PaddleTensor lod_word_tensor, lod_mention_tensor;
+ lod_word_tensor.name = "word";
+ lod_mention_tensor.name = "mention";
+ auto one_batch = data->NextBatch();
+ int size = one_batch.lod[one_batch.lod.size() - 1]; // token batch size
+ lod_word_tensor.shape.assign({size, 1});
+ lod_word_tensor.lod.assign({one_batch.lod});
+ lod_mention_tensor.shape.assign({size, 1});
+ lod_mention_tensor.lod.assign({one_batch.lod});
+ // assign data
+ TensorAssignData(&lod_word_tensor, one_batch.rnn_word_datas);
+ TensorAssignData(&lod_mention_tensor, one_batch.rnn_mention_datas);
+ // Set inputs.
+ input_slots->assign({lod_word_tensor, lod_mention_tensor});
+ for (auto &tensor : *input_slots) {
+ tensor.dtype = PaddleDType::INT64;
+ }
+}
+
+// the first inference result
+const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
+ 48, 39, 38, 16, 25};
+
+void TestChineseNERPrediction() {
+ NativeConfig config;
+ config.prog_file = FLAGS_infer_model + "/__model__";
+ config.param_file = FLAGS_infer_model + "/param";
+ config.use_gpu = false;
+ config.device = 0;
+ config.specify_input_name = true;
+
+ auto predictor =
+ CreatePaddlePredictor(config);
+ std::vector input_slots;
+ DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+ // Prepare inputs.
+ PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+ std::vector outputs;
+
+ Timer timer;
+ timer.tic();
+ for (int i = 0; i < FLAGS_repeat; i++) {
+ predictor->Run(input_slots, &outputs);
+ }
+ LOG(INFO) << "===========profile result===========";
+ LOG(INFO) << "batch_size: " << FLAGS_batch_size
+ << ", repeat: " << FLAGS_repeat
+ << ", latency: " << timer.toc() / FLAGS_repeat << "ms";
+ LOG(INFO) << "=====================================";
+
+ PADDLE_ENFORCE(outputs.size(), 1UL);
+ auto &out = outputs[0];
+ size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+ [](int a, int b) { return a * b; });
+ PADDLE_ENFORCE_GT(size, 0);
+ int64_t *result = static_cast(out.data.data());
+ for (size_t i = 0; i < std::min(11UL, size); i++) {
+ PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
+ }
+}
+
+// Directly infer with the original model.
+TEST(Analyzer, Chinese_ner) { TestChineseNERPrediction(); }
+
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 2cc83c777ce7a92d4e00a95d2d2c40753e2ffbcf..ec1f3979a74bd86ee7402bca441e95d3d177d113 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -34,7 +34,7 @@ namespace paddle {
namespace inference {
namespace analysis {
-using namespace framework;
+using namespace framework; // NOLINT
TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
@@ -201,13 +201,13 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data,
minute_tensor.lod.assign({one_batch.lod3});
// clang-format on
// assign data
- TensorAssignData(&lod_attention_tensor,
- std::vector<std::vector<float>>({{0, 0}}));
+ TensorAssignData<float>(&lod_attention_tensor,
+ std::vector<std::vector<float>>({{0, 0}}));
std::vector<float> tmp_zeros(batch_size * 15, 0.);
- TensorAssignData(&init_zero_tensor, {tmp_zeros});
- TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data);
- TensorAssignData(&week_tensor, one_batch.rnn_week_datas);
- TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas);
+ TensorAssignData<float>(&init_zero_tensor, {tmp_zeros});
+ TensorAssignData<float>(&lod_tensor_tensor, one_batch.rnn_link_data);
+ TensorAssignData<float>(&week_tensor, one_batch.rnn_week_datas);
+ TensorAssignData<float>(&minute_tensor, one_batch.rnn_minute_datas);
// Set inputs.
auto init_zero_tensor1 = init_zero_tensor;
init_zero_tensor1.name = "hidden_init";
@@ -312,8 +312,8 @@ void TestDituRNNPrediction(const std::string &model_path,
PADDLE_ENFORCE_GT(size, 0);
float *data = static_cast<float *>(out.data.data());
float *base_data = static_cast<float *>(base_out.data.data());
- for (size_t i = 0; i < size; i++) {
- EXPECT_NEAR(data[i], base_data[i], 1e-3);
+ for (size_t j = 0; j < size; j++) {
+ EXPECT_NEAR(data[j], base_data[j], 1e-3);
}
}
@@ -329,6 +329,7 @@ void TestDituRNNPrediction(const std::string &model_path,
ASSERT_TRUE(fuse_statis.count("fc"));
EXPECT_EQ(fuse_statis.at("fc"), 1);
+ EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 1);
}
}
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 3a4ffe967e67ab0487192bbf12d4d5a15f536aa3..e8fb0775b45761f64fd6fd28306c35b76d1e40c4 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -67,7 +67,7 @@ struct Argument {
PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
key);
attrs_[key] = data;
- attr_deleters_[key] = [data, key, this]() {
+ attr_deleters_[key] = [data, key]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key;
delete data;
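
The argument.h change above drops `this` from the deleter lambda's capture list because the deleter only uses `data` and `key`. A small self-contained sketch of that capture pattern (the `AttrStore` type is hypothetical, not the framework's class):

    #include <functional>
    #include <iostream>
    #include <string>
    #include <unordered_map>

    struct AttrStore {
      std::unordered_map<std::string, std::function<void()>> deleters;

      template <typename T>
      void Set(const std::string &key, T *data) {
        // Capture by value only what the deleter actually uses; no `this` needed.
        deleters[key] = [data, key]() {
          std::cout << "argument delete attr: " << key << std::endl;
          delete data;
        };
      }

      ~AttrStore() {
        for (auto &kv : deleters) kv.second();
      }
    };

    int main() {
      AttrStore store;
      store.Set("graph", new int(42));
      return 0;  // the destructor runs the registered deleters
    }
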
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index da1c0b1fbc9b2d025422deb11368e4070bffdc74..7de97c69e7eb897903afa107488286e21e70f505 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -69,7 +69,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
feed_names_[op->Output("Out")[0]] = idx;
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
- if (fetchs_.size() <= idx) {
+ if (fetchs_.size() <= static_cast<size_t>(idx)) {
fetchs_.resize(idx + 1);
}
fetchs_[idx] = op;
@@ -179,8 +179,13 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
LOG(ERROR) << "fail to call Init";
return nullptr;
}
+#ifdef __clang__
+ // fix clang compile error
+ return cls;
+#else
// fix manylinux compile error.
return std::move(cls);
+#endif
}
bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
@@ -222,6 +227,62 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
}
return true;
}
+template <typename T>
+void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch,
+ PaddleTensor *output) {
+ std::vector<int> shape;
+ auto dims_i = fetch.dims();
+ auto lod = fetch.lod();
+ const T *output_ptr = fetch.data<T>();
+ auto num = fetch.numel();
+ std::vector<T> data;
+ if (0 == lod.size()) {
+ std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
+ for (int j = 0; j < dims_i.size(); ++j) {
+ shape.push_back(dims_i[j]);
+ }
+ } else {
+ // for batch detection
+ // image[0] -> output[0] shape {145, 6}
+ // image[1] -> output[1] shape {176, 6}
+ // then,
+ // the batch output shape {321, 6}
+ // the lod {{0, 145, 321}}
+ // so we should append output[0] to {176, 6}
+ size_t max_dim = 0;
+ for (size_t j = 1; j < lod[0].size(); j++) {
+ max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
+ }
+ size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
+ if (max_dim > 0) {
+ data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
+ }
+ for (size_t j = 1; j < lod[0].size(); j++) {
+ size_t start = lod[0][j - 1] * common_dim;
+ size_t end = lod[0][j] * common_dim;
+ if (end > start) {
+ std::copy(output_ptr + start, output_ptr + end,
+ data.begin() + (j - 1) * max_dim * common_dim);
+ }
+ }
+ shape.push_back(lod[0].size() - 1);
+ shape.push_back(max_dim);
+ for (int j = 1; j < dims_i.size(); ++j) {
+ shape.push_back(dims_i[j]);
+ }
+ }
+
+ output->shape = shape;
+ auto &buffer = output->data;
+ if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) {
+ buffer.Resize(sizeof(T) * data.size());
+ }
+ std::memcpy(buffer.data(), data.data(), buffer.length());
+ // copy LoD
+ for (const auto &level : fetch.lod()) {
+ output->lod.emplace_back(level);
+ }
+}
bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
framework::Scope *scope) {
@@ -229,70 +290,20 @@ bool NativePaddlePredictor::GetFetch(std::vector<PaddleTensor> *outputs,
outputs->resize(fetchs_.size());
for (size_t i = 0; i < fetchs_.size(); ++i) {
int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
- PADDLE_ENFORCE(idx == i);
- framework::LoDTensor &output =
+ PADDLE_ENFORCE((size_t)idx == i);
+ framework::LoDTensor &fetch =
framework::GetFetchVariable(*scope, "fetch", idx);
- // TODO(panyx0718): Support fetch of other types.
- if (output.type() != typeid(float)) {
- LOG(ERROR) << "only support fetching float now.";
- return false;
- }
-
- std::vector<int> shape;
- auto dims_i = output.dims();
- auto lod = output.lod();
- const float *output_ptr = output.data<float>();
- // const int64_t* output_ptr = fetchs[i].data<int64_t>();
- auto num = output.numel();
- std::vector<float> data;
- if (0 == lod.size()) {
- std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
- for (int j = 0; j < dims_i.size(); ++j) {
- shape.push_back(dims_i[j]);
- }
+ auto type = fetch.type();
+ auto output = &(outputs->at(i));
+ if (type == typeid(float)) {
+ GetFetchOne<float>(fetch, output);
+ output->dtype = PaddleDType::FLOAT32;
+ } else if (type == typeid(int64_t)) {
+ GetFetchOne<int64_t>(fetch, output);
+ output->dtype = PaddleDType::INT64;
} else {
- // for batch detection
- // image[0] -> output[0] shape {145, 6}
- // image[1] -> output[1] shape {176, 6}
- // then,
- // the batch output shape {321, 6}
- // the lod {{0, 145, 321}}
- // so we should append output[0] to {176, 6}
- size_t max_dim = 0;
- for (size_t j = 1; j < lod[0].size(); j++) {
- max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
- }
- size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
- if (max_dim > 0) {
- data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
- }
- for (size_t j = 1; j < lod[0].size(); j++) {
- size_t start = lod[0][j - 1] * common_dim;
- size_t end = lod[0][j] * common_dim;
- if (end > start) {
- std::copy(output_ptr + start, output_ptr + end,
- data.begin() + (j - 1) * max_dim * common_dim);
- }
- }
- shape.push_back(lod[0].size() - 1);
- shape.push_back(max_dim);
- for (int j = 1; j < dims_i.size(); ++j) {
- shape.push_back(dims_i[j]);
- }
- }
-
- outputs->at(i).shape = shape;
- auto &buffer = outputs->at(i).data;
- if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
- buffer.Resize(sizeof(float) * data.size());
- }
- std::memcpy(buffer.data(), data.data(), buffer.length());
- // copy LoD
- for (const auto &level : output.lod()) {
- outputs->at(i).lod.emplace_back(level);
+ LOG(ERROR) << "unknown type, only support float32 and int64 now.";
}
- outputs->at(i).dtype = PaddleDType::FLOAT32;
- // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
}
return true;
}
@@ -323,7 +334,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
return nullptr;
}
+#ifdef __clang__
+ // fix clang compile error
+ return predictor;
+#else
return std::move(predictor);
+#endif
}
} // namespace paddle
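
The lod-padding branch added in `GetFetchOne` above can be read in isolation: per-instance outputs of different lengths (e.g. {145, 6} and {176, 6}) arrive as one flat {321, 6} tensor with lod {{0, 145, 321}}, and each sequence is copied into a zero-padded slot of width `max_dim * common_dim`. A standalone sketch of just that step (`PadByLoD` is an illustrative name, not part of the API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Pads every lod segment of a flat row-major buffer to the longest segment.
    std::vector<float> PadByLoD(const std::vector<float> &flat,
                                const std::vector<size_t> &lod,  // e.g. {0, 145, 321}
                                size_t common_dim) {             // e.g. 6
      size_t max_dim = 0;
      for (size_t j = 1; j < lod.size(); ++j) {
        max_dim = std::max(max_dim, lod[j] - lod[j - 1]);
      }
      std::vector<float> padded((lod.size() - 1) * max_dim * common_dim, 0.f);
      for (size_t j = 1; j < lod.size(); ++j) {
        size_t start = lod[j - 1] * common_dim;
        size_t end = lod[j] * common_dim;
        std::copy(flat.begin() + start, flat.begin() + end,
                  padded.begin() + (j - 1) * max_dim * common_dim);
      }
      return padded;  // shape {lod.size() - 1, max_dim, common_dim}
    }
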
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4eff9204eba987aed11e4066fa7b6f6cc610a763..ec801c58857e716241d28404510530e551ed25aa 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -51,7 +51,9 @@ class NativePaddlePredictor : public PaddlePredictor {
framework::Scope *scope);
bool GetFetch(std::vector<PaddleTensor> *output_data,
framework::Scope *scope);
-
+ template <typename T>
+ void GetFetchOne(const framework::LoDTensor &fetchs,
+ PaddleTensor *output_data);
void PrepareFeedFetch();
NativeConfig config_;
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index e44b1b74bc385c015fa6efcebac05359a810cbc1..19832e890915fc45b1d3e76e39e685e1878383d7 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -16,6 +16,7 @@
#include
#include
+#include
#include
#include
#include
@@ -68,6 +69,13 @@ static void split_to_float(const std::string &str, char sep,
std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs),
[](const std::string &v) { return std::stof(v); });
}
+static void split_to_int64(const std::string &str, char sep,
+ std::vector<int64_t> *is) {
+ std::vector<std::string> pieces;
+ split(str, sep, &pieces);
+ std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is),
+ [](const std::string &v) { return std::stoi(v); });
+}
template <typename T>
std::string to_string(const std::vector<T> &vec) {
std::stringstream ss;
@@ -84,14 +92,18 @@ template <>
std::string to_string<std::vector<std::vector<float>>>(
const std::vector<std::vector<std::vector<float>>> &vec);
-// clang-format off
-static void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<float>> &data) {
+template <typename T>
+static void TensorAssignData(PaddleTensor *tensor,
+ const std::vector<std::vector<T>> &data) {
// Assign buffer
- int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, [](int a, int b) { return a * b; });
- tensor->data.Resize(sizeof(float) * dim);
+ int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1,
+ [](int a, int b) { return a * b; });
+ tensor->data.Resize(sizeof(T) * dim);
int c = 0;
for (const auto &f : data) {
- for (float v : f) { static_cast<float *>(tensor->data.data())[c++] = v; }
+ for (T v : f) {
+ static_cast<T *>(tensor->data.data())[c++] = v;
+ }
}
}
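
With helper.h templated as above, the same assignment helper serves both the float RNN inputs and the int64 NER inputs. A usage fragment, assuming helper.h and the inference API headers from this change are included:

    // Fragment only: relies on PaddleTensor, TensorAssignData<T> and
    // split_to_int64 defined in the headers touched by this change.
    std::vector<std::vector<int64_t>> word_ids = {{30, 45, 41}, {48, 17}};
    PaddleTensor words;
    words.name = "word";
    words.shape.assign({5, 1});  // total tokens x 1
    words.dtype = PaddleDType::INT64;
    TensorAssignData<int64_t>(&words, word_ids);

    std::vector<int64_t> parsed;
    split_to_int64("30 45 41", ' ', &parsed);  // parsed == {30, 45, 41}
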
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 10d678111f5325e495b24286e6ecf651230393fe..b6cb935814e25b31d4104f9ce24fe952680cb491 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -74,7 +74,7 @@ struct BeamSearchDecodeFunctor {
}
template <typename T>
- void operator()() const;
+ void apply() const;
bool tensor_on_gpu_;
size_t beam_size_;
@@ -88,7 +88,7 @@ struct BeamSearchDecodeFunctor {
};
template <typename T>
-void BeamSearchDecodeFunctor::operator()() const {
+void BeamSearchDecodeFunctor::apply() const {
BeamSearchDecoder beam_search_decoder(beam_size_, end_id_);
// Check if the tensor is on GPU. If so, use the CPU copy instead
if (tensor_on_gpu_) {
@@ -101,7 +101,7 @@ void BeamSearchDecodeFunctor::operator()() const {
}
template <>
-void BeamSearchDecodeFunctor::operator()<bool>() const {
+void BeamSearchDecodeFunctor::apply<bool>() const {
PADDLE_THROW("beam search decode op does not support bool!");
}
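
The `operator()` to `apply` renames in this change all follow the same pattern: a runtime dtype is mapped to a compile-time `T` and the functor's templated `apply<T>()` is invoked. A generic, self-contained sketch of that dispatch (the `DispatchByType` helper here is illustrative, not the framework's actual visitor):

    #include <cstdint>
    #include <iostream>

    enum class DType { FP32, INT64, BOOL };

    template <typename Visitor>
    void DispatchByType(DType t, Visitor &&v) {
      switch (t) {
        case DType::FP32:  v.template apply<float>();   break;
        case DType::INT64: v.template apply<int64_t>(); break;
        case DType::BOOL:  v.template apply<bool>();    break;
      }
    }

    struct PrintSizeFunctor {
      template <typename T>
      void apply() const { std::cout << sizeof(T) << " bytes" << std::endl; }
    };

    int main() {
      DispatchByType(DType::INT64, PrintSizeFunctor{});  // prints "8 bytes"
      return 0;
    }
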
diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h
index 6220e57f5941d89cbf0aea268b85ad55af6132cc..8fa0416049f8fa128d7ab61f8350b41960f07263 100644
--- a/paddle/fluid/operators/cast_op.h
+++ b/paddle/fluid/operators/cast_op.h
@@ -37,7 +37,7 @@ struct CastOpFunctor {
: in_(in), out_(out), ctx_(ctx) {}
template <typename OutT>
- void operator()() const {
+ void apply() const {
auto* in_begin = in_->data<InT>();
auto numel = in_->numel();
auto* in_end = in_begin + numel;
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index d29b0153389574de8992b93ac6795e91556af870..fcdcafae7273afa6887ee531dfc37ef833b92d68 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -33,7 +33,7 @@ struct AppendProposalsFunctor {
: out_(out), offset_(offset), to_add_(to_add) {}
template <typename T>
- void operator()() const {
+ void apply() const {
auto *out_data = out_->data<T>();
auto *to_add_data = to_add_->data<T>();
memcpy(out_data + offset_, to_add_data, to_add_->numel() * sizeof(T));
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index 925dc19061e2196a40411f415eb6e5ad59ab52ff..adc7cb1f9e48ba5fabeb91c5e3ecec016db34a45 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -25,7 +25,7 @@ struct FillOpVisitor {
: tensor_(tensor), value_(value) {}
template <typename T>
- void operator()() const {
+ void apply() const {
platform::CPUPlace cpu;
auto *data = tensor_->mutable_data<T>(cpu);
std::transform(value_.data(), value_.data() + tensor_->numel(), data,
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 00ba5ce8ee5e4084c8af204cfc37fe80c437f0d7..b3f7e0c0097b469998049a1db65d56a28cf02b5e 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
namespace paddle {
namespace operators {
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 1b75df5d7d97e54dfdc461660e53a368311e3778..d7f0f3c6280db7d121bf8821ec6d578e22a33da6 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,4 +1,6 @@
+if (NOT WIN32)
add_subdirectory(detail)
+endif(NOT WIN32)
function(math_library TARGET)
# math_library is a function to create math library.
@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
-math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col)
+
+if (NOT WIN32) # windows do not support avx functions yet.
+math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
+endif (NOT WIN32)
+
cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
math_library(math_function DEPS blas)
math_library(maxouting)
@@ -51,7 +57,9 @@ math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax DEPS math_function)
+if (NOT WIN32)
math_library(matrix_bit_code)
+endif (NOT WIN32)
math_library(unpooling)
math_library(vol2col)
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 9a6e646b28fdec78734eb4e7b98c8acf688b2645..5923792902a81521256de300f77955f1ea3d16c6 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -55,7 +55,7 @@ struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
- void operator()() const {
+ void apply() const {
auto cpu = platform::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 12d1baa8fb544a8b9684e43204c61ba410d1b295..79b7538ad05b0ff348b8264d50b63211b5254e80 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -52,7 +52,7 @@ struct TensorSetConstantGPU {
: context_(context), tensor_(tensor), value_(value) {}
template <typename T>
- void operator()() const {
+ void apply() const {
SetConstant<platform::CUDADeviceContext, T> functor;
functor(reinterpret_cast<const platform::CUDADeviceContext &>(context_),
tensor_, static_cast<T>(value_));
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
index 625065692c1f32c89d9e566d00051e237ac9a3af..59d8b9b8a8d554eb16826712ff634eed5df2d648 100644
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -41,7 +41,7 @@ struct OneHotOpCUDAFunctor {
: in_(in), out_(out), depth_(depth), ctx_(ctx) {}
template <typename OutT>
- void operator()() const {
+ void apply() const {
auto* p_in_data = in_->data<InT>();
auto numel = in_->numel();
auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h
index 7e77f25089c4bd0297b0eb5a0ed7555cc0af5a9f..1ebd2676496940ff8f90caaaded5c8227bd7ae78 100644
--- a/paddle/fluid/operators/one_hot_op.h
+++ b/paddle/fluid/operators/one_hot_op.h
@@ -31,7 +31,7 @@ struct OneHotOpFunctor {
: in_(in), out_(out), depth_(depth), ctx_(ctx) {}
template <typename OutT>
- void operator()() const {
+ void apply() const {
auto* p_in_data = in_->data<InT>();
auto numel = in_->numel();
auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a706d05fd7c35ef993f5199f0f893622cb863c5d
--- /dev/null
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -0,0 +1,584 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+void Pad2DConstNCHW(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left, T value,
+ T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ out_data[out_h * out_width + out_w] =
+ (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)
+ ? value
+ : in_data[in_h * in_width + in_w];
+ }
+ }
+ in_data += in_height * in_width;
+ out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DConstNHWC(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left, T value,
+ T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ const int out_index = (out_h * out_width + out_w) * channels;
+ if (in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width) {
+ for (int c = 0; c < channels; ++c) {
+ out_data[out_index + c] = value;
+ }
+ } else {
+ const int in_index = (in_h * in_width + in_w) * channels;
+ for (int c = 0; c < channels; ++c) {
+ out_data[out_index + c] = in_data[in_index + c];
+ }
+ }
+ }
+ }
+ in_data += in_height * in_width * channels;
+ out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+void Pad2DReflectNCHW(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left, T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ in_h = std::max(in_h, -in_h); // reflect by 0
+ in_h =
+ std::min(in_h, 2 * in_height - in_h - 2); // reflect by in_height
+ in_w = std::max(in_w, -in_w); // reflect by 0
+ in_w =
+ std::min(in_w, 2 * in_width - in_w - 2); // reflect by in_width
+ out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
+ }
+ }
+ in_data += in_height * in_width;
+ out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DReflectNHWC(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left, T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ const int out_index = (out_h * out_width + out_w) * channels;
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ in_h = std::max(in_h, -in_h);
+ in_h = std::min(in_h, 2 * in_height - in_h - 2);
+ in_w = std::max(in_w, -in_w);
+ in_w = std::min(in_w, 2 * in_width - in_w - 2);
+ const int in_index = (in_h * in_width + in_w) * channels;
+
+ for (int c = 0; c < channels; ++c) {
+ out_data[out_index + c] = in_data[in_index + c];
+ }
+ }
+ }
+ in_data += in_height * in_width * channels;
+ out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+void Pad2DEdgeNCHW(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width, const int pad_top,
+ const int pad_left, T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+ int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+ out_data[out_h * out_width + out_w] = in_data[in_h * in_width + in_w];
+ }
+ }
+ in_data += in_height * in_width;
+ out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DEdgeNHWC(const T* in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width, const int pad_top,
+ const int pad_left, T* out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ const int out_index = (out_h * out_width + out_w) * channels;
+ int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+ int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+ const int in_index = (in_h * in_width + in_w) * channels;
+ for (int c = 0; c < channels; ++c) {
+ out_data[out_index + c] = in_data[in_index + c];
+ }
+ }
+ }
+ in_data += in_height * in_width * channels;
+ out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+void Pad2DGradConstNCHW(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ if (!(in_h < 0 || in_w < 0 || in_h >= in_height ||
+ in_w >= in_width)) {
+ d_in_data[in_h * in_width + in_w] =
+ d_out_data[out_h * out_width + out_w];
+ }
+ }
+ }
+ d_in_data += in_height * in_width;
+ d_out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DGradConstNHWC(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ const int out_index = (out_h * out_width + out_w) * channels;
+ if (!(in_h < 0 || in_w < 0 || in_h >= in_height || in_w >= in_width)) {
+ const int in_index = (in_h * in_width + in_w) * channels;
+ for (int c = 0; c < channels; ++c) {
+ d_in_data[in_index + c] = d_out_data[out_index + c];
+ }
+ }
+ }
+ }
+ d_in_data += in_height * in_width * channels;
+ d_out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+void Pad2DGradReflectNCHW(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ in_h = std::max(in_h, -in_h); // reflect over 0
+ in_h = std::min(in_h,
+ 2 * in_height - in_h - 2); // reflect over in_height
+ in_w = std::max(in_w, -in_w); // reflect over 0
+ in_w =
+ std::min(in_w, 2 * in_width - in_w - 2); // reflect over in_width
+ d_in_data[in_h * in_width + in_w] +=
+ d_out_data[out_h * out_width + out_w];
+ }
+ }
+ d_in_data += in_height * in_width;
+ d_out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DGradReflectNHWC(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ const int out_index = (out_h * out_width + out_w) * channels;
+ int in_h = out_h - pad_top;
+ int in_w = out_w - pad_left;
+ in_h = std::max(in_h, -in_h);
+ in_h = std::min(in_h, 2 * in_height - in_h - 2);
+ in_w = std::max(in_w, -in_w);
+ in_w = std::min(in_w, 2 * in_width - in_w - 2);
+ const int in_index = (in_h * in_width + in_w) * channels;
+ for (int c = 0; c < channels; ++c) {
+ d_in_data[in_index + c] += d_out_data[out_index + c];
+ }
+ }
+ }
+ d_in_data += in_height * in_width * channels;
+ d_out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+void Pad2DGradEdgeNCHW(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int c = 0; c < channels; ++c) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+ int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+ d_in_data[in_h * in_width + in_w] +=
+ d_out_data[out_h * out_width + out_w];
+ }
+ }
+ d_in_data += in_height * in_width;
+ d_out_data += out_height * out_width;
+ }
+ }
+}
+
+template <typename T>
+void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
+ const int in_height, const int in_width,
+ const int out_height, const int out_width,
+ const int pad_top, const int pad_left,
+ const T* d_out_data) {
+ for (int n = 0; n < num; ++n) {
+ for (int out_h = 0; out_h < out_height; ++out_h) {
+ for (int out_w = 0; out_w < out_width; ++out_w) {
+ const int out_index = (out_h * out_width + out_w) * channels;
+ int in_h = std::min(in_height - 1, std::max(out_h - pad_top, 0));
+ int in_w = std::min(in_width - 1, std::max(out_w - pad_left, 0));
+ const int in_index = (in_h * in_width + in_w) * channels;
+ for (int c = 0; c < channels; ++c) {
+ d_in_data[in_index + c] += d_out_data[out_index + c];
+ }
+ }
+ }
+ d_in_data += in_height * in_width * channels;
+ d_out_data += out_height * out_width * channels;
+ }
+}
+
+template <typename T>
+class Pad2dCPUKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto pads = context.Attr<std::vector<int>>("paddings");
+ auto mode = context.Attr<std::string>("mode");
+ auto data_format = context.Attr<std::string>("data_format");
+ T value = context.Attr<T>("pad_value");
+ auto* x = context.Input<Tensor>("X");
+ auto* out = context.Output<Tensor>("Out");
+ auto in_dims = x->dims();
+ auto out_dims = out->dims();
+ const T* in_data = x->data<T>();
+ T* out_data = out->mutable_data<T>(context.GetPlace());
+ const int pad_top = pads[0];
+ const int pad_left = pads[2];
+ const int num = in_dims[0];
+ if (data_format == "NCHW") {
+ const int channels = in_dims[1];
+ const int in_height = in_dims[2];
+ const int in_width = in_dims[3];
+ const int out_height = out_dims[2];
+ const int out_width = out_dims[3];
+ if (mode == "reflect") {
+ Pad2DReflectNCHW(in_data, num, channels, in_height, in_width,
+ out_height, out_width, pad_top, pad_left, out_data);
+ } else if (mode == "edge") {
+ Pad2DEdgeNCHW(in_data, num, channels, in_height, in_width, out_height,
+ out_width, pad_top, pad_left, out_data);
+ } else {
+ Pad2DConstNCHW(in_data, num, channels, in_height, in_width, out_height,
+ out_width, pad_top, pad_left, value, out_data);
+ }
+ } else {
+ const int channels = in_dims[3];
+ const int in_height = in_dims[1];
+ const int in_width = in_dims[2];
+ const int out_height = out_dims[1];
+ const int out_width = out_dims[2];
+ if (mode == "reflect") {
+ Pad2DReflectNHWC(in_data, num, channels, in_height, in_width,
+ out_height, out_width, pad_top, pad_left, out_data);
+ } else if (mode == "edge") {
+ Pad2DEdgeNHWC(in_data, num, channels, in_height, in_width, out_height,
+ out_width, pad_top, pad_left, out_data);
+ } else {
+ Pad2DConstNHWC(in_data, num, channels, in_height, in_width, out_height,
+ out_width, pad_top, pad_left, value, out_data);
+ }
+ }
+ }
+};
+
+template <typename T>
+class Pad2dGradCPUKernel : public framework::OpKernel<T> {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto pads = context.Attr<std::vector<int>>("paddings");
+ auto mode = context.Attr