diff --git a/Dockerfile b/Dockerfile
index 402adee2ea2822250ebc8f6229fd6a44545d58e5..634be18a51bf61e96a8bf6f263b6674a7932d6e4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
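For context, the RUN step above streams the TensorRT tarball from the new CDN host straight into tar. A rough Python equivalent of that download-and-extract step (the URL is taken from the diff and may no longer be served; the destination is the same /usr/local used above):

```python
import tarfile
import urllib.request

# URL taken from the Dockerfile change above; it may no longer be available.
URL = ("http://paddlepaddledeps.cdn.bcebos.com/"
       "TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz")

def fetch_tensorrt(dest="/usr/local"):
    # Stream the gzipped tarball and unpack it, mirroring
    # `wget -qO- ... | tar -xz -C /usr/local`.
    with urllib.request.urlopen(URL) as resp:
        with tarfile.open(fileobj=resp, mode="r|gz") as tar:
            tar.extractall(path=dest)

if __name__ == "__main__":
    fetch_tensorrt()
```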
diff --git a/README.md b/README.md
index a67cb8ad439f462c361cb6bac2449c3a4b042126..60ffbe728178705b1734e682868614025214c2a4 100644
--- a/README.md
+++ b/README.md
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85
## Installation
-It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
-before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
+It is recommended to read [the installation guide](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.
## Documentation
-We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.
-- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
+- [Deep Learning 101](https://github.com/PaddlePaddle/book)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)
You can run distributed training jobs on MPI clusters.
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
-
- You can also run distributed training jobs on Kubernetes clusters.
-
-- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)
Our new API enables much shorter programs.
-- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)
We appreciate your contributions!
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b520c03a836a9e3f263ba050f151877ffe0d071d..03c73786a6c31868b1893bfcb319e43e37db1a3d 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
+if (NOT WIN32) # windows msvc2015 supports c++11 natively.
+# -std=c++11 and -fPIC are not recognized by msvc; -Xcompiler will be added by cmake.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+endif(NOT WIN32)
+
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
# in cuda9, suppress cuda warning on eigen
list(APPEND CUDA_NVCC_FLAGS "-w")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+if (NOT WIN32)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
# nvcc 9 does not support -Os. Use Release flags instead
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
endif()
+else(NOT WIN32)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+ list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+else()
+  message(FATAL_ERROR "Windows only supports Release builds for now. Please set the Visual Studio build type to Release, x64.")
+endif()
+endif(NOT WIN32)
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
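To summarize the conditional logic introduced above: on non-Windows hosts nvcc still receives `-std=c++11` and `-Xcompiler -fPIC` plus the CMake build-type flags, while on Windows only a Release build is accepted and nvcc gets `-O3 -DNDEBUG`. A small Python sketch of that decision, as an illustration only (the real logic lives in cmake/cuda.cmake above):

```python
def nvcc_flags(is_windows, build_type, cxx_flags_for_type=None):
    """Mirror the flag selection in cmake/cuda.cmake after this change."""
    flags = []
    if not is_windows:
        # MSVC does not understand these; they are only added on non-Windows hosts.
        flags += ["-std=c++11", "-Xcompiler -fPIC"]
    flags += ["--use_fast_math", "-w", "--expt-relaxed-constexpr"]
    if not is_windows:
        # Reuse whatever CMAKE_CXX_FLAGS_<TYPE> holds (MinSizeRel falls back to Release flags).
        flags += cxx_flags_for_type or []
    else:
        if build_type != "Release":
            raise RuntimeError("Windows only supports Release builds; set the "
                               "Visual Studio build type to Release, x64.")
        flags += ["-O3", "-DNDEBUG"]
    return flags

print(nvcc_flags(is_windows=True, build_type="Release"))
```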
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 7fb67afbe15a5a019c978092d5ba3a4a0f66d996..fd9835d023c67b76579913f2ec56c2444fea8c15 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
# 3. keep only zlib, cares, protobuf, boringssl under "third_party",
# checkout and clean other dirs under third_party
# 4. remove .git, and package the directory.
- URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+ URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
URL_MD5 "1f268a2aff6759839dccd256adcc91cf"
PREFIX ${GRPC_SOURCES_DIR}
UPDATE_COMMAND ""
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index bc36683a9facc253e7b9feb0c5a56e79491fb9b0..f61770514eb05a99c140cdb18575c89aa5235c14 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(module "framework")
if (NOT WIN32)
-copy(framework_lib DEPS framework_py_proto
- SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
- DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
-)
-else()
-copy(framework_lib
+set(framework_lib_deps framework_py_proto)
+endif(NOT WIN32)
+copy(framework_lib DEPS ${framework_lib_deps}
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
- DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+ ${src_dir}/${module}/ir/*.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
)
-endif(NOT WIN32)
set(module "memory")
copy(memory_lib
@@ -161,7 +158,8 @@ set(module "inference")
copy(inference_lib DEPS ${inference_deps}
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
- DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+ ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
+ DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
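The copy() rules above pair each SRCS pattern with the DSTS entry at the same position, which is why adding `${src_dir}/${module}/ir/*.h` and `paddle_inference_pass.h` to SRCS also requires a matching extra entry in each DSTS list. A hedged Python sketch of that pairing (copy() itself is a CMake helper defined elsewhere in the Paddle build; the function and paths below are only an illustration):

```python
import glob
import os
import shutil

def copy(srcs, dsts):
    # Each SRCS pattern maps to the DSTS entry at the same index,
    # so the two lists must stay the same length.
    assert len(srcs) == len(dsts), "SRCS and DSTS must be paired one-to-one"
    for pattern, dst in zip(srcs, dsts):
        os.makedirs(dst, exist_ok=True)
        for path in glob.glob(pattern):
            shutil.copy(path, dst)

# Hypothetical call mirroring the framework_lib rule above.
copy(["paddle/fluid/framework/*.h",
      "paddle/fluid/framework/details/*.h",
      "build/paddle/fluid/framework/framework.pb.h",
      "paddle/fluid/framework/ir/*.h"],
     ["dist/framework", "dist/framework/details", "dist/framework", "dist/framework/ir"])
```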
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
index b782242a6632a5d42a512cf3b830d6e047c064ab..e4682ccb94e6fc60e184632dff9ee16a6bf16ec0 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
@@ -1,5 +1,5 @@
-服务器端部署 - Anakin
-#####################
+Anakin - 服务器端加速引擎
+#######################
使用文档
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
deleted file mode 100644
index a5209e8560b31e9f0f776fba9a2b8c5bc150165c..0000000000000000000000000000000000000000
--- a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-服务器端部署 - 原生引擎
-#######################
-
-.. toctree::
- :maxdepth: 2
-
- build_and_install_lib_cn.rst
- native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/index.rst b/doc/fluid/new_docs/advanced_usage/index.rst
index dea7c236619a0bdbf402f371571d947d1cdbba65..89166573eebca045e948046c69f3b7a3e0031d58 100644
--- a/doc/fluid/new_docs/advanced_usage/index.rst
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
@@ -10,7 +10,6 @@
.. toctree::
:maxdepth: 2
- deploy/index_native.rst
deploy/index_anakin.rst
deploy/index_mobile.rst
development/contribute_to_paddle.md
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dc7c62b06287ad333dd41082e566b0553d3a5341
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+train.log
+output
+data/cifar-10-batches-py/
+data/cifar-10-python.tar.gz
+data/*.txt
+data/*.list
+data/mean.meta
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
index 8d645718e12e4d976a8e71de105e11f495191fbf..4f20843596aa676962a36241f59560ec2a41257b 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
@@ -21,7 +21,7 @@
图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果,即模型可以正确识别图像上的主要物体。
-
+
图1. 通用图像分类展示
@@ -30,7 +30,7 @@
-
+
图2. 细粒度图像分类展示
@@ -38,7 +38,7 @@
一个好的模型既要对不同类别识别正确,同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动,较好的模型会像聪明的人类一样能够正确识别。
-
+
图3. 扰动图片展示[22]
@@ -61,7 +61,7 @@
Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破,效果大幅度超越传统方法,获得了ILSVRC2012冠军,该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后,涌现了一系列CNN模型,不断地在ImageNet上刷新成绩,如图4展示。随着模型变得越来越深以及精妙的结构设计,Top-5的错误率也越来越低,降到了3.5%附近。而在同样的ImageNet数据集上,人眼的辨识错误率大概在5.1%,也就是目前的深度学习模型的识别能力已经超过了人眼。
-
+
图4. ILSVRC图像分类Top-5错误率
@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
传统CNN包含卷积层、全连接层等组件,并采用softmax多类别分类器和多类交叉熵损失函数,一个典型的卷积神经网络如图5所示,我们先介绍用来构造CNN的常见组件。
-
+
图5. CNN网络示例[20]
@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构,它的核心是五组卷积操作,每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积,卷积核的数目由较浅组的64增多到最深组的512,同一组内的卷积核数目是一样的。卷积之后接两层全连接层,之后是分类层。由于每组内卷积层的不同,有11、13、16、19层这几种模型,下图展示一个16层的网络结构。VGG模型结构相对简洁,提出之后也有很多文章基于此模型进行研究,如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。
-
+
图6. 基于ImageNet的VGG16模型
@@ -106,7 +106,7 @@ NIN模型主要有两个特点:
Inception模块如下图7所示,图(a)是最简单的设计,输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数,拼接后会导致特征的通道数较大,经过几层这样的模块堆积后,通道数会越来越大,导致参数和计算量也随之增大。为了改善这个缺点,图(b)引入3个1x1卷积层进行降维,所谓的降维就是减少通道数,同时如NIN模型中提到的1x1卷积也可以修正线性特征。
-
+
图7. Inception模块
@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外,在网络最后也没
GoogleNet整体网络结构如图8所示,总共22层网络:开始由3层普通的卷积组成;接下来由三组子网络组成,第一组子网络包含2个Inception模块,第二组包含5个Inception模块,第三组包含2个Inception模块;然后接均值池化层、全连接层。
-
+
图8. GoogleNet[12]
@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
残差模块如图9所示,左边是基本模块连接方式,由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式,之所以称为瓶颈,是因为上面的1x1卷积用来降维(图示例即256->64),下面的1x1卷积用来升维(图示例即64->256),这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。
-
+
图9. 残差模块
图10展示了50、101、152层网络连接示意图,使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快,成功的训练了上百乃至近千层的卷积神经网络。
-
+
图10. 基于ImageNet的ResNet模型
@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
由于ImageNet数据集较大,下载和训练较慢,为了方便大家学习,我们使用[CIFAR10]()数据集。CIFAR10数据集包含60,000张32x32的彩色图片,10个类别,每个类包含6,000张。其中50,000张图片作为训练集,10000张作为测试集。图11从每个类别中随机抽取了10张图片,展示了所有的类别。
-
+
图11. CIFAR10数据集[21]
@@ -377,7 +377,7 @@ test_reader = paddle.batch(
`event_handler_plot`可以用来利用回调数据来打点画图:
-
+
图12. 训练结果
@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
图13是训练的分类错误率曲线图,运行到第200个pass后基本收敛,最终得到测试集上分类错误率为8.54%。
-
+
图13. CIFAR10数据集上VGG模型的分类错误率
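The VGG description in the hunks above (five groups of 3x3 convolutions with max-pooling between groups, channel counts growing from 64 to 512) can be written down compactly. A small sketch of the standard VGG-16 layer plan, just to make the "16 layers" count concrete:

```python
# Standard VGG-16 configuration: (number of 3x3 conv layers, output channels) per group,
# with a 2x2 max-pool after each group; 13 conv layers + 3 fully connected = 16 weight layers.
VGG16_GROUPS = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]

conv_layers = sum(n for n, _ in VGG16_GROUPS)
print("conv layers:", conv_layers, "-> total weight layers:", conv_layers + 3)
```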
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png
deleted file mode 100644
index f3c5f2f7b0c84f83382b70124dcd439586ed4eb0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
deleted file mode 100644
index ca8f858a902ea723d886d2b88c2c0a1005301c50..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
deleted file mode 100644
index 38b21f21604b1bb84fc3f6aa96bd5fce45d15a55..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
deleted file mode 100644
index 647c822e52cd55d50e5f207978f5e6ada86cf34c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
deleted file mode 100644
index 04245cef60fe7126ae4c92ba8085273965078bee..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
deleted file mode 100644
index 249dbf96df61c3352ea5bd80470f6c4a1e03ff10..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
deleted file mode 100644
index 4660ac122e9d533023a21154d35eee29e3b08d27..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
deleted file mode 100644
index 9591a0c1e8c0165c40ca560be35a7b9a91cd5027..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png
deleted file mode 100644
index 39580c20b583f2a15d17fd124a572c84e6e2db1d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
deleted file mode 100644
index 77f785e03bacd38c4c64a817874a58ff3298d2f3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png
deleted file mode 100644
index 97a1e3eee45c0db95e6a943ca3b8c0cf6c34d4b6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
deleted file mode 100644
index 57e45cc0c27dd99b9918de2ff1228bc6b65f7424..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png
deleted file mode 100644
index 147e575bf49086811c43420d5a9c8f749e2da405..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
deleted file mode 100644
index 0aeb4f254639fdbf18e916dc219ca61602596d85..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
deleted file mode 100644
index c500eb01a90190ff66150871fe83ec275e2de8d7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
deleted file mode 100644
index c6336a9a69b95dc978719ce68896e3e752e67fed..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png
deleted file mode 100644
index b4ebbbe6a50f5fd7cd0cccb52cdac5653e34654c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png
deleted file mode 100644
index 88c60fe87f802c5ce560bb15bbdbd229aeafc4e4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
deleted file mode 100644
index 6270eefcfd7071bc1643ee06567e5b81aaf4c177..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/index.rst b/doc/fluid/new_docs/beginners_guide/basics/index.rst
index e1fd226116d88fbf137741242b304b367e598ba5..0fcb008e0a7773e81e5124da09fe07366130b924 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
@@ -6,7 +6,7 @@
.. todo::
概述
-
+
.. toctree::
:maxdepth: 2
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..29b5622a53a1b0847e9f53febf1cc50dcf4f044a
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
@@ -0,0 +1,12 @@
+data/train.list
+data/test.*
+data/conll05st-release.tar.gz
+data/conll05st-release
+data/predicate_dict
+data/label_dict
+data/word_dict
+data/emb
+data/feature
+output
+predict.res
+train.log
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
index 47e948bd1ffc0ca692dc9899193e94831ce4234b..0891f5b6b16a1b715b44db6c47ba079adfcad4c5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
5. 对第4步的结果,通过多分类得到论元的语义角色标签。可以看到,句法分析是基础,并且后续步骤常常会构造的一些人工特征,这些特征往往也来自句法分析。
-
+
图1. 依存句法分析句法树示例
@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
我们继续以上面的这句话为例,图1展示了BIO表示方法。
-
+
图2. BIO标注方法示例
@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
图3是最终得到的栈式循环神经网络结构示意图。
-
+
图3. 基于LSTM的栈式循环神经网络结构示意图
@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
为了克服这一缺陷,我们可以设计一种双向循环网络单元,它的思想简单且直接:对上一节的栈式循环神经网络进行一个小小的修改,堆叠多个LSTM单元,让每一层LSTM单元分别以:正向、反向、正向 …… 的顺序学习上一层的输出序列。于是,从第2层开始,$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。
-
+
图4. 基于LSTM的双向循环神经网络结构示意图
@@ -79,7 +79,7 @@ CRF是一种概率化结构模型,可以看作是一个概率无向图模型
序列标注任务只需要考虑输入和输出都是一个线性序列,并且由于我们只是将输入序列作为条件,不做任何条件独立假设,因此输入序列的元素之间并不存在图结构。综上,在序列标注任务中使用的是如图5所示的定义在链式图上的CRF,称之为线性链条件随机场(Linear Chain Conditional Random Field)。
-
+
图5. 序列标注任务中使用的线性链条件随机场
@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
4. CRF以第3步中LSTM学习到的特征为输入,以标记序列为监督信号,完成序列标注;
-
+
图6. SRL任务上的深层双向LSTM模型
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
deleted file mode 100644
index e63f5ebd6d00f2e4ecf97b9ab2027e74683013f2..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
deleted file mode 100644
index f0a195c24d9ee493f96bb93c28a99e70566be7a4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
deleted file mode 100644
index e5f7151c9fcc50a7cf7af485cbbc7e4fccab0c20..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
deleted file mode 100644
index 93b44dd4874402ef29ad7bd7d94147609b92e309..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
deleted file mode 100644
index 592f7ee23bdc88a9a35059612e5ab880bbc9d34b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
deleted file mode 100644
index c3646312e48db977402fb353dc0c9b4d02269bf4..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
deleted file mode 100644
index 9265b671735940ed6549e2980064d2ce08baae64..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
deleted file mode 100644
index 23f4f45b603e3d60702af2b2464d10fc8deed061..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
deleted file mode 100644
index 0778fda74b2ad22ce4b631791a7b028cdef780a5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
deleted file mode 100644
index 3d2914c726b5f4c46e66dfa85d4e88649fede6b3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
deleted file mode 100644
index 0b944ef91e8b5ba4b14d2a35bd8879f261cf8f61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..6129b9e8645010fcb8372d9dc3dbb568dfa80907
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
@@ -0,0 +1,9 @@
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
+multi-bleu.perl
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
index f37c559921483a3d7c619ed74903df56b0584bd5..6e5f77fec8a894c390ced8c93ee344fd8d27370e 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
@@ -11,10 +11,10 @@
为解决以上问题,统计机器翻译(Statistical Machine Translation, SMT)技术应运而生。在统计机器翻译技术中,转化规则是由机器自动从大规模的语料中学习得到的,而非我们人主动提供规则。因此,它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题,但仍然存在许多挑战:1)人为设计许多特征(feature),但永远无法覆盖所有的语言现象;2)难以利用全局的特征;3)依赖于许多预处理环节,如词语对齐、分词或符号化(tokenization)、规则抽取、句法分析等,而每个环节的错误会逐步累积,对翻译的影响也越来越大。
近年来,深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类:1)仍以统计机器翻译系统为框架,只是利用神经网络来改进其中的关键模块,如语言模型、调序模型等(见图1的左半部分);2)不再以统计机器翻译系统为框架,而是直接用神经网络将源语言映射到目标语言,即端到端的神经网络机器翻译(End-to-End Neural Machine Translation, End-to-End NMT)(见图1的右半部分),简称为NMT模型。
-![nmt](./image/nmt.png)
-
+
+
图1. 基于神经网络的机器翻译系统
-
+
本教程主要介绍NMT模型,以及如何用PaddlePaddle来训练一个NMT模型。
@@ -45,19 +45,22 @@
具体来说,该双向循环神经网络分别在时间维以顺序和逆序——即前向(forward)和后向(backward)——依次处理输入序列,并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点,都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN,其中有六个权重矩阵:输入到前向隐层和后向隐层的权重矩阵(`$W_1, W_3$`),隐层到隐层自己的权重矩阵(`$W_2,W_5$`),前向隐层和后向隐层到输出层的权重矩阵(`$W_4, W_6$`)。注意,该网络的前向隐层和后向隐层之间没有连接。
-![bi_rnn](./image/bi_rnn.png)
-
-图3. 按时间步展开的双向循环神经网络
-
+
+
+
+图2. 按时间步展开的双向循环神经网络
+
### 编码器-解码器框架
编码器-解码器(Encoder-Decoder)\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量,解码阶段通过最大化预测序列概率,从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
![encoder_decoder](./image/encoder_decoder.png)
-
-图4. 编码器-解码器框架
-
+
+
+图3. 编码器-解码器框架
+
+
#### 编码器
编码阶段分为三步:
@@ -69,19 +72,17 @@
3. 用RNN编码源语言词序列:这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`,其中`$h_0$`是一个全零的向量,`$\varnothing _\theta$`是一个非线性激活函数,最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码,或使用时间维上的池化(pooling)结果。
第3步也可以使用双向循环神经网络实现更复杂的句编码表示,具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词,并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的,后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词,得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`,通过拼接两个GRU的结果得到它的隐层状态,即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
-
-![encoder_attention](./image/encoder_attention.png)
-
-图5. 使用双向GRU的编码器
-
+
+
+图4. 使用双向GRU的编码器
+
#### 解码器
机器翻译任务的训练过程中,解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是:
-
1. 每一个时刻,根据源语言句子的编码信息(又叫上下文向量,context vector)`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。计算公式如下:
$$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
-其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用[注意力机制](#注意力机制)时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
+其中`$\phi _{\theta '}$`是一个非线性激活函数;`$c=q\mathbf{h}$`是源语言句子的上下文向量,在不使用注意力机制时,如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素,则可以定义`$c=h_T$`;`$u_i$`是目标语言序列的第`$i$`个单词,`$u_0$`是目标语言序列的开始标记``,表示解码开始;`$z_i$`是`$i$`时刻解码RNN的隐层状态,`$z_0$`是一个全零的向量。
2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下:
$$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
机器翻译任务的生成过程,通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异,具体介绍请见[柱搜索算法](#柱搜索算法)。
+
### 柱搜索算法
柱搜索([beam search](http://en.wikipedia.org/wiki/Beam_search))是一种启发式图搜索算法,用于在图或树中搜索有限集合中的最优扩展节点,通常用在解空间非常大的系统(如机器翻译、语音识别)中,原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`你好`”,就算目标语言字典中只有3个词(``, ``, `hello`),也可能生成无限句话(`hello`循环出现的次数不定),为了找到其中较好的翻译结果,我们可采用柱搜索算法。
@@ -100,7 +102,6 @@ $$p\left ( u_{i+1}|u_{<i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
柱搜索算法使用广度优先策略建立搜索树,在树的每一层,按照启发代价(heuristic cost)(本教程中,为生成词的log概率之和)对节点进行排序,然后仅留下预先确定的个数(文献中通常称为beam width、beam size、柱宽度等)的节点。只有这些节点会在下一层继续扩展,其他节点就被剪掉了,也就是说保留了质量较高的节点,剪枝了质量较差的节点。因此,搜索所占用的空间和时间大幅减少,但缺点是无法保证一定获得最优解。
使用柱搜索算法的解码阶段,目标是最大化生成序列的概率。思路是:
-
1. 每一个时刻,根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`,计算出下一个隐层状态`$z_{i+1}$`。
2. 将`$z_{i+1}$`通过`softmax`归一化,得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。
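The beam-search procedure described above (expand each hypothesis, score by the sum of word log-probabilities, and keep only the top `beam_size` candidates at every step) can be sketched independently of the NMT model. A minimal sketch over a hypothetical `next_logprobs(prefix)` function, using made-up start/end markers `<s>`/`<e>` and a toy distribution:

```python
import heapq
import math

def beam_search(next_logprobs, beam_size=3, max_len=10, eos="<e>"):
    # Each hypothesis is (total_logprob, tokens); start from the start marker only.
    beam = [(0.0, ["<s>"])]
    for _ in range(max_len):
        candidates = []
        for score, tokens in beam:
            if tokens[-1] == eos:            # finished hypotheses are kept as-is
                candidates.append((score, tokens))
                continue
            for word, logp in next_logprobs(tokens).items():
                candidates.append((score + logp, tokens + [word]))
        # Keep only the beam_size highest-scoring hypotheses; prune the rest.
        beam = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
    return max(beam, key=lambda c: c[0])

# Toy next-word distribution: always prefers "hello", then the end marker.
def toy_next_logprobs(tokens):
    return {"hello": math.log(0.6), "<e>": math.log(0.3), "<s>": math.log(0.1)}

print(beam_search(toy_next_logprobs, beam_size=2, max_len=5))
```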
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
deleted file mode 100644
index 9d8efd50a49d0305586f550344472ab94c93bed3..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
deleted file mode 100644
index 4b35c88fc8ea2c503473c0c15711744e784d6af6..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
deleted file mode 100644
index 1b355e7786d25487a3f564af758c2c52c43b4690..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
deleted file mode 100644
index 3728f782ee09d9308d02b42305027b2735467ead..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
deleted file mode 100644
index 28d7a15a3bd65262bde22a3f41b5aa78b46b368a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
deleted file mode 100644
index ea8585565da1ecaf241654c278c6f9b15e283286..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
deleted file mode 100644
index 60aee0017de73f462e35708b1055aff8992c03e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
deleted file mode 100644
index 6b73798fe632e0873b35c117b86f347c8cf3116a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
deleted file mode 100644
index 0cde685b84106650a4df18ce335a23e6338d3d11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
deleted file mode 100644
index a6af429f23f0f7e82650139bbd8dcbef27a34abe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
deleted file mode 100644
index bf56d73ebf297fadf522389c7b6836dd379aa097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
deleted file mode 100644
index 557310e044b2b6687e5ea6895417ed946ac7bc11..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..f23901aeb3a9e7cd12611fc556742670d04a9bb5
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
@@ -0,0 +1,2 @@
+.idea
+.ipynb_checkpoints
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
index 0f7c97021f8ad463fc51ed169604b789ea068c3d..4b79e62f74e587fcd939d9f9e911af80992ea6a3 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
@@ -37,7 +37,7 @@ Prediction Score is 4.25
YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成:候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选,排序网络对候选进行打分排序,输出排名最高的数十个结果。系统结构如图1所示:
-
+
图1. YouTube 推荐系统结构
@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站,YouTube推荐
首先,将观看历史及搜索词记录这类历史信息,映射为向量后取平均值得到定长表示;同时,输入人口学特征以优化新用户的推荐效果,并将二值特征和连续特征归一化处理到[0, 1]范围。接下来,将所有特征表示拼接为一个向量,并输入给非线形多层感知器(MLP,详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程)处理。最后,训练时将MLP的输出给softmax做分类,预测时计算用户的综合特征(MLP的输出)与所有视频的相似度,取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。
-
+
图2. 候选生成网络结构
@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
卷积神经网络主要由卷积(convolution)和池化(pooling)操作构成,其应用及组合方式灵活多变,种类繁多。本小结我们以如图3所示的网络进行讲解:
-
+
图3. 卷积神经网络文本分类模型
@@ -107,7 +107,7 @@ $$\hat c=max(c)$$
-
+
图4. 融合推荐模型
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
deleted file mode 100644
index c213608e769f69fb2cfe8597f8e696ee53730e3d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
deleted file mode 100644
index 8aedb2204371e7691140ceffa5992f6080bbf097..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
deleted file mode 100644
index 4298567ac5600173343299999965b20612e7affe..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
deleted file mode 100644
index a98e7cc67606b31e4c945f7eb907563e46dcef56..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
deleted file mode 100644
index 7fd97b9cc3a0b9105b41591af4e8f8e4646bd681..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
deleted file mode 100644
index 90c9b09fb78db98391ee199934f2d16efd6d6652..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
deleted file mode 100644
index 6fc8e11967000ec48c1c0a6fa3c2eaecb80cbb84..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
deleted file mode 100644
index 61e63d9147cbc2901706ef80776d706e5368c3c5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
deleted file mode 100644
index fbcae2be81141be955076e877b94b0ea5d7e4d4a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..667762d327cb160376a4119fa9df9db41b6443b2
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
@@ -0,0 +1,10 @@
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
index 5844b6fe137c2401a04e47b5b489434ee9b363f1..8477cf32146c33947ced447c8bdd287a3e1e71f5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
@@ -37,7 +37,7 @@
循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上,循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据(词序列),近年来,循环神经网络及其变体(如long short term memory\[[5](#参考文献)\]等)在自然语言处理的多个领域,如语言模型、句法解析、语义角色标注(或一般的序列标注)、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。
-
+
图1. 循环神经网络按时间展开的示意图
@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
其中,$i_t, f_t, c_t, o_t$分别表示输入门,遗忘门,记忆单元及输出门的向量值,带角标的$W$及$b$为模型参数,$tanh$为双曲正切函数,$\odot$表示逐元素(elementwise)的乘法操作。输入门控制着新输入进入记忆单元$c$的强度,遗忘门控制着记忆单元维持上一时刻值的强度,输出门控制着输出记忆单元的强度。三种门的计算方式类似,但有着完全不同的参数,它们各自以不同的方式控制着记忆单元$c$,如图2所示:
-
+
图2. 时刻$t$的LSTM [7]
@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
如图3所示(以三层为例),奇数层LSTM正向,偶数层LSTM反向,高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入,对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示(这一表示充分融合了文本的上下文信息,并且对文本进行了深层次抽象),最后我们将文本表示连接至softmax构建分类模型。
-
+
图3. 栈式双向LSTM用于文本分类
@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
网络的输入`input_dim`表示的是词典的大小,`class_dim`表示类别数。这里,我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。
+
+
### 栈式双向LSTM
栈式双向神经网络`stacked_lstm_net`的代码片段如下:
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
deleted file mode 100644
index 98fbea413a98a619004ca669c67f5f867fe974c9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
deleted file mode 100644
index d73a00bf2c1fca2f9b8c26bccf5ea844fa1db50b..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
deleted file mode 100644
index 26c904102a6e6c4e30f0048b81373ae8c148b355..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
deleted file mode 100644
index 6b2adf70f2b5112a2e82505da5cff9f5fd0c6298..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
deleted file mode 100644
index 8b5dbd726178b5555c513294e7b10a81acc96ff5..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..a620e0279c310d213d4e6d8e99e666962c11e352
--- /dev/null
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
@@ -0,0 +1,3 @@
+data/train.list
+data/test.list
+data/simple-examples*
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
index d21c7ddcc501f863b5ce672123dbbc6c26528f15..904d99fe2ffc9ead69a86c9763568a5c098348d5 100644
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
@@ -34,7 +34,7 @@ $$X = USV^T$$
本章中,当词向量训练好后,我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影(如下图所示)。从图中可以看出,语义相关的词语(如a, the, these; big, huge)在投影上距离很近,语意无关的词(如say, business; decision, japan)在投影上的距离很远。
-
+
图1. 词向量的二维投影
@@ -50,7 +50,7 @@ similarity: -0.0997506977351
```
-以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[应用模型](#应用模型)中详细描述用法。
+以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到,我们将在[模型应用](#模型应用)中详细描述用法。
## 模型概览
@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率,$R(\theta)$表示参数正则项。
-
+
图2. N-gram神经网络模型
@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
CBOW模型通过一个词的上下文(各N个词)预测当前词。当N=2时,模型如下图所示:
-
+
图3. CBOW模型
@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
CBOW的好处是对上下文词语的分布在词向量上进行了平滑,去掉了噪声,因此在小数据集上很有效。而Skip-gram的方法中,用一个词预测其上下文,得到了当前词上下文的很多样本,因此可用于更大的数据集。
-
+
图4. Skip-gram模型
@@ -189,12 +189,13 @@ dream that one day
最后,每个输入会按其单词次在字典里的位置,转化成整数的索引序列,作为PaddlePaddle的输入。
+
## 编程实现
本配置的模型结构如下图所示:
-
+
图5. 模型配置中的N-gram神经网络模型
@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995
...
```
+
## 模型应用
在模型训练后,我们可以用它做一些预测。
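The CBOW model described earlier in this file averages the embeddings of the surrounding words ($context = (x_{t-1}+x_{t-2}+x_{t+1}+x_{t+2})/4$) and then classifies the center word with a softmax. A tiny NumPy sketch of that forward step, with a made-up vocabulary and randomly initialized embedding and output matrices (assumptions, not the tutorial's actual parameters):

```python
import numpy as np

vocab = ["<s>", "the", "cat", "sat", "on", "mat", "<e>"]
word_to_id = {w: i for i, w in enumerate(vocab)}

emb_dim = 8
rng = np.random.default_rng(0)
embeddings = rng.normal(size=(len(vocab), emb_dim))   # input word vectors
W_out = rng.normal(size=(emb_dim, len(vocab)))        # output (classification) weights

def cbow_predict(context_words):
    # Average the context word vectors, then softmax over the vocabulary.
    ctx = np.mean([embeddings[word_to_id[w]] for w in context_words], axis=0)
    logits = ctx @ W_out
    probs = np.exp(logits - logits.max())
    return probs / probs.sum()

# Predict the center word of "the cat ? on mat" from a window of two words on each side.
print(vocab[int(np.argmax(cbow_predict(["the", "cat", "on", "mat"])))])
```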
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
deleted file mode 100644
index 384f59919a2c8dedb198e97d51434616648932e1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
deleted file mode 100644
index 76b7d4bc0f99372465bd9aa34721513d39ad0776..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
deleted file mode 100644
index d985c393e618e9b79df05e4ff0ae57ccc93744d0..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
deleted file mode 100644
index 2e16ab2f443732b8ef5404a8e7cd2457bc5eee23..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
deleted file mode 100644
index 2449dce6a86b43b1b997ff418ed0dba56848463f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
deleted file mode 100644
index 1e0b40a8f7aefdf46d42761305511f281c08e595..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
deleted file mode 100644
index 158bd64b8f8729dea67834a8d591d21bce8b8564..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
deleted file mode 100644
index ce4a8bf4769183cbaff91793753d2350a3ce936c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
deleted file mode 100644
index a3ab385845d3dc8b5c670bae91225bc8dd47a8bb..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
deleted file mode 100644
index 3c36c6d1f66eb98ea78c0673965d02a4ee3aa288..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
index 27d25b43961ce74d73e391b735369501fb80a231..9574dbea2f9a39bb196b61bb4fd12ba7c378f75a 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b, i=1,\ldo
## 效果展示
我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中,每个点的横坐标表示同一类房屋真实价格的中位数,纵坐标表示线性回归模型根据特征预测的结果,当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确,则点离虚线越近。
-
+
图1. 预测值 V.S. 真实值
@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
### 训练过程
定义好模型结构之后,我们要通过以下几个步骤进行模型训练
-
1. 初始化参数,其中包括权重$\omega_i$和偏置$b$,对其进行初始化(如0均值,1方差)。
-
2. 网络正向传播计算网络输出和损失函数。
-
3. 根据损失函数进行反向误差传播 ([backpropagation](https://en.wikipedia.org/wiki/Backpropagation)),将网络误差从输出层依次向前传递, 并更新网络中的参数。
-
4. 重复2~3步骤,直至网络训练误差达到规定的程度或训练轮次达到设定值。
## 数据集
@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
- 很多的机器学习技巧/模型(例如L1,L2正则项,向量空间模型-Vector Space Model)都基于这样的假设:所有的属性取值都差不多是以0为均值且取值范围相近的。
-
+
图2. 各维属性的取值范围
@@ -199,10 +195,12 @@ step = 0
def event_handler_plot(event):
global step
if isinstance(event, fluid.EndStepEvent):
- if event.step % 10 == 0: # record the test cost every 10 seconds
+ if step % 10 == 0: # record a train cost every 10 batches
+ plot_cost.append(train_title, step, event.metrics[0])
+
+ if step % 100 == 0: # record a test cost every 100 batches
test_metrics = trainer.test(
reader=test_reader, feed_order=feed_order)
-
plot_cost.append(test_title, step, test_metrics[0])
plot_cost.plot()
@@ -210,12 +208,13 @@ def event_handler_plot(event):
# If the accuracy is good enough, we can stop the training.
print('loss is less than 10.0, stop')
trainer.stop()
-
- # We can save the trained parameters for the inferences later
- if params_dirname is not None:
- trainer.save_params(params_dirname)
-
step += 1
+
+ if isinstance(event, fluid.EndEpochEvent):
+ if event.epoch % 10 == 0:
+ # We can save the trained parameters for the inferences later
+ if params_dirname is not None:
+ trainer.save_params(params_dirname)
```
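The rewritten `event_handler_plot` above records a training cost every 10 batches, a test cost every 100 batches, and now saves parameters only at every tenth epoch instead of on every step. A standalone sketch of that cadence, with hypothetical stand-ins for the Fluid trainer and the plotting helper (print calls instead of `Ploter`), just to show when each branch fires:

```python
class FakeTrainer:
    """Stand-in for fluid.Trainer; only the methods used by the callback."""
    def test(self, **kwargs):
        return [0.5]                      # pretend test cost
    def save_params(self, dirname):
        print("saved params to", dirname)

step = 0
trainer = FakeTrainer()

def on_step_end(metrics, params_dir="params_dir"):
    global step
    if step % 10 == 0:                    # record a train cost every 10 batches
        print("train cost at step", step, "=", metrics[0])
    if step % 100 == 0:                   # record a test cost every 100 batches
        print("test cost at step", step, "=", trainer.test()[0])
    step += 1

def on_epoch_end(epoch, params_dir="params_dir"):
    if epoch % 10 == 0 and params_dir is not None:
        trainer.save_params(params_dir)   # save only every tenth epoch

for epoch in range(2):
    for _ in range(5):
        on_step_end([1.0])
    on_epoch_end(epoch)
```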
### 开始训练
@@ -231,11 +230,10 @@ trainer.train(
event_handler=event_handler_plot,
feed_order=feed_order)
```
-
-
-
- 图3. 训练结果
-
+
+
+图3. 训练结果
+
## 预测
@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
batch_size = 10
test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
test_data = test_reader().next()
-test_feat = numpy.array([data[0] for data in test_data]).astype("float32")
-test_label = numpy.array([data[1] for data in test_data]).astype("float32")
+test_x = numpy.array([data[0] for data in test_data]).astype("float32")
+test_y = numpy.array([data[1] for data in test_data]).astype("float32")
-results = inferencer.infer({'x': test_feat})
+results = inferencer.infer({'x': test_x})
print("infer results: (House Price)")
-for k in range(0, batch_size-1):
- print("%d. %f" % (k, results[0][k]))
+for idx, val in enumerate(results[0]):
+ print("%d: %.2f" % (idx, val))
print("\nground truth:")
-for k in range(0, batch_size-1):
- print("%d. %f" % (k, test_label[k]))
+for idx, val in enumerate(test_y):
+ print("%d: %.2f" % (idx, val))
```
## 总结
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
deleted file mode 100644
index 27e4acb1313794f52ad9ad9e874cdadd197ff41f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png
deleted file mode 100644
index f111c7cd766b7e9981513cc8c65be87dbbf3a79e..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
deleted file mode 100644
index 5325df4800985983e17476f007658d1cdb170b1c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png
deleted file mode 100644
index 6d6a079bfdcc33617f6cf36612b271b48be6304f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png
deleted file mode 100644
index bcd304a6a0baf30ecfbc43e08fc0aca179d05958..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
index 3289116991cb8ebaa4a6fb78e100ce16f633d69c..ac36c4ecf6b9b716fe5f0dbe2346e64918c22242 100644
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -6,8 +6,8 @@
当我们学习编程的时候,编写的第一个程序一般是实现打印"Hello World"。而机器学习(或深度学习)的入门教程,一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题,比较简单,同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集,包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵,标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。
- 图1. MNIST图片示例
+图1. MNIST图片示例
MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3(SD-3)和Special Database 1(SD-1)构建而来。由于SD-3是由美国人口调查局的员工进行标注,SD-1是由美国高中生进行标注,因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集(60000条数据)和测试集(10000条数据),其中训练集来自250位不同的标注员,此外还保证了训练集和测试集的标注员是不完全相同的。
@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$
在分类问题中,我们一般采用交叉熵代价损失函数(cross entropy loss),公式如下:
-$$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
+$$ L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$
图2为softmax回归的网络图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
图2. softmax回归网络结构图
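作为参考,下面用 numpy 演示一次 softmax 与交叉熵损失的计算(数值为随意选取的示例):

```python
import numpy as np

def softmax(z):
    e = np.exp(z - z.max())           # 减去最大值以避免数值溢出
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1])    # 网络输出层的原始得分
label = np.array([1.0, 0.0, 0.0])     # one-hot 形式的真实标签
y = softmax(logits)
cross_entropy = -np.sum(label * np.log(y))
print(cross_entropy)
```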
@@ -54,16 +54,14 @@ $$ L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
Softmax回归模型采用了最简单的两层神经网络,即只有输入层和输出层,因此其拟合能力有限。为了达到更好的识别效果,我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。
1. 经过第一个隐藏层,可以得到 $ H_1 = \phi(W_1X + b_1) $,其中$\phi$代表激活函数,常见的有sigmoid、tanh或ReLU等函数。
-
2. 经过第二个隐藏层,可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
-
3. 最后,再经过输出层,得到的$Y=\text{softmax}(W_3H_2 + b_3)$,即为最后的分类结果向量。
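下面用 numpy 给出上述前向计算的一个简化示意(权重随机初始化,激活函数以 ReLU 为例,仅用于说明计算流程):

```python
import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

def softmax(z):
    e = np.exp(z - z.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

X = np.random.rand(8, 784)                         # 8 张展开成 784 维向量的图片
W1, b1 = np.random.randn(784, 200) * 0.01, np.zeros(200)
W2, b2 = np.random.randn(200, 200) * 0.01, np.zeros(200)
W3, b3 = np.random.randn(200, 10) * 0.01, np.zeros(10)

H1 = relu(X.dot(W1) + b1)                          # 第一个隐藏层
H2 = relu(H1.dot(W2) + b2)                         # 第二个隐藏层
Y = softmax(H2.dot(W3) + b3)                       # 输出层,每行是 10 个类别的概率
```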
图3为多层感知器的网络结构图,图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。
图3. 多层感知器网络结构图
@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
在多层感知器模型中,将图像展开成一维向量输入到网络中,忽略了图像的位置和结构信息,而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构:输入的二维图像,先经过两次卷积层到池化层,再经过全连接层,最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。
图4. LeNet-5卷积神经网络结构
@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积,即离散二维滤波器(也称作卷积核)与二维图像做卷积操作,简单地讲是二维滤波器滑动到二维图像上所有位置,并在每个位置上与该像素点及其邻域像素点做内积。卷积操作被广泛应用于图像处理领域,不同卷积核可以提取不同的特征,例如边沿、线性、角等特征。在深层卷积神经网络中,通过卷积操作可以提取出图像从低级到复杂的特征。
图5. 卷积层图片
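为帮助理解,下面给出一个朴素的二维卷积示意实现(无 padding、步长为 1,卷积核为假设的边缘检测核,仅作演示):

```python
import numpy as np

def conv2d(image, kernel):
    """最朴素的二维卷积(严格说是互相关)实现:不做 padding,步长为 1。"""
    H, W = image.shape
    k = kernel.shape[0]
    out = np.zeros((H - k + 1, W - k + 1))
    for i in range(out.shape[0]):
        for j in range(out.shape[1]):
            out[i, j] = np.sum(image[i:i + k, j:j + k] * kernel)
    return out

image = np.random.rand(28, 28)
edge_kernel = np.array([[1., 0., -1.],
                        [1., 0., -1.],
                        [1., 0., -1.]])    # 一个简单的竖直边缘检测核
feature_map = conv2d(image, edge_kernel)   # 输出大小为 26 x 26
```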
@@ -98,16 +96,15 @@ Softmax回归模型采用了最简单的两层神经网络,即只有输入层
#### 池化层
图6. 池化层图片
池化是非线性下采样的一种形式,主要作用是通过减少网络的参数来减小计算量,并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域,对每个矩形框内的数取最大值作为输出,如图6所示。
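下面用 numpy 给出一个最大池化的简化示意(假设输入边长能被池化窗口整除,仅作演示):

```python
import numpy as np

def max_pool(feature_map, size=2):
    """用不重叠的 size x size 矩形框做最大池化。"""
    H, W = feature_map.shape
    return feature_map.reshape(H // size, size, W // size, size).max(axis=(1, 3))

feature_map = np.arange(16, dtype="float32").reshape(4, 4)
print(max_pool(feature_map))   # 输出 2 x 2,每个元素是对应 2x2 区域内的最大值
```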
-更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
-
-### 常见激活函数介绍
+关于卷积神经网络更详细的知识可以参考[斯坦福大学公开课](http://cs231n.github.io/convolutional-networks/)和[图像分类](https://github.com/PaddlePaddle/book/tree/develop/03.image_classification)教程。
+### 常见激活函数介绍
- sigmoid激活函数: $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $
- tanh激活函数: $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
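这些激活函数都可以直接用 numpy 验证,例如(仅作演示):

```python
import numpy as np

x = np.linspace(-3.0, 3.0, 7)
sigmoid = 1.0 / (1.0 + np.exp(-x))     # sigmoid(x)
tanh = np.tanh(x)                      # 等价于 (e^x - e^-x) / (e^x + e^-x)
relu = np.maximum(x, 0.0)              # 常见的 ReLU 激活
```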
@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
我们建议使用 Fluid API,因为它更容易上手。
下面是 Fluid API 的简要概述。
-
1. `inference_program`:指定如何从数据输入中获得预测的函数。
这是指定网络流的地方。
-2. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
+1. `train_program`:指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
这是指定损失计算的地方。
-3. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
+1. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。
-4. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
+1. `Trainer`:PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
通过 `event_handler` 回调函数,用户可以监控训练的进展。
-5. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
-
+1. `Inferencer`:Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
然后,它可以推断数据和返回预测。
在这个演示中,我们将深入了解它们。
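为了对这几个概念的配合方式有一个整体印象,下面给出一个省略了数据读取和训练细节的骨架示意(以手写数字分类为例,具体用法以下文的完整示例为准):

```python
import paddle.fluid as fluid

def inference_program():                      # 1. 定义网络:由输入图片得到预测概率
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    return fluid.layers.fc(input=img, size=10, act='softmax')

def train_program():                          # 2. 由预测值与标签计算 loss
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    predict = inference_program()
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)
    return [avg_cost, acc]

def optimizer_program():                      # 3. 配置优化器
    return fluid.optimizer.Adam(learning_rate=0.001)

# 4. Trainer 管理训练过程;5. Inferencer 加载训练好的参数用于预测
trainer = fluid.Trainer(train_func=train_program,
                        place=fluid.CPUPlace(),
                        optimizer_func=optimizer_program)
```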
@@ -240,6 +235,7 @@ def train_program():
acc = fluid.layers.accuracy(input=predict, label=label)
return [avg_cost, acc]
+
```
#### Optimizer Function 配置
@@ -255,9 +251,9 @@ def optimizer_program():
下一步,我们开始训练过程。`paddle.dataset.mnist.train()`和`paddle.dataset.mnist.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数,每次调用的时候返回一个Python yield generator。
-下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
+下面`shuffle`是一个reader decorator,它接受一个reader A,返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里,然后随机打乱其顺序,并且逐条输出。
-`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader 。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
+`batch`是一个特殊的decorator,它的输入是一个reader,输出是一个batched reader。在PaddlePaddle里,一个reader每次yield一条训练数据,而一个batched reader每次yield一个minibatch。
```python
train_reader = paddle.batch(
@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
trainer = fluid.Trainer(
train_func=train_program, place=place, optimizer_func=optimizer_program)
-
```
#### Event Handler 配置
@@ -315,11 +310,10 @@ def event_handler(event):
`event_handler_plot` 可以用来在训练过程中画图如下:
-图7. 训练结果
+图7. 训练结果
```python
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
deleted file mode 100644
index 3f5cdaacdc6acce41c5c6c99649be46685cf9903..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png
deleted file mode 100644
index bc1a9a4ccf81972dc0d69cf4c808a52218e14d61..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
deleted file mode 100644
index 65bd17eacd41bbdbdb042bd1ba366eb53663b410..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png
deleted file mode 100644
index 77524754df906ab096e120bd657449f4565c3418..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
deleted file mode 100644
index c751892ba0be3ae803b5933c3f33487ecfb6fe7f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
deleted file mode 100644
index 030cd60d3b4af9aecd4941204da4ad15f6e1189f..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
deleted file mode 100644
index 90b02fa2a735cfcc9efb2de90906325dedcb358c..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png
deleted file mode 100644
index c626723512b6ee02abd55e5bab65e7629d130522..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
deleted file mode 100644
index 9f4d26cd8da32201d0a5e9c72d466301dd2b42a1..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png
deleted file mode 100644
index 1fedea6a75abbf132cbbcf8ab10ce045997d697a..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
deleted file mode 100644
index f5a478fdc24f29c17555a2f1451f3f5a079faed9..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png
deleted file mode 100644
index 7d5508a1eccfcea1925f438043ee93b57769bebf..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
deleted file mode 100644
index 4edd7cabf8a2282f6392ac1421c7ca4afb288589..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
deleted file mode 100644
index 40b98298288b9c406fce1cbca9c913753020a94d..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png
deleted file mode 100644
index 833d3c663c94dd2d57fd19686949ded37a91f541..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
deleted file mode 100644
index 47204941af7f22e68386a70a06ec4f122b83e262..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png
deleted file mode 100644
index 6fa0a951d5262effb707e3e15af8cb900e5560b8..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png and /dev/null differ
diff --git a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png
deleted file mode 100644
index 5cb87b450d0398bcfaec0e647c362052069797e7..0000000000000000000000000000000000000000
Binary files a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png and /dev/null differ
diff --git a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
index a2f30823a6fcd379f94e6e98d043b0d00681827f..84987ea5daee9abd0fe2fe71bdfde62ea3388ab5 100644
--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -149,7 +149,7 @@ python setup.py bdist_wheel
pip install --upgrade dist/visualdl-*.whl
```
-如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md)
+如果打包和安装遇到其他问题,不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)
## SDK
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst b/doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
similarity index 100%
rename from doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/build_and_install_lib_cn.rst
diff --git a/doc/fluid/new_docs/user_guides/howto/inference/index.rst b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..45e1a2883773b92ed47ef8d51417bbdcd060b4ec
--- /dev/null
+++ b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
@@ -0,0 +1,11 @@
+############
+模型预测部署
+############
+
+PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线。
+
+.. toctree::
+ :maxdepth: 2
+
+ build_and_install_lib_cn.rst
+ native_infer.rst
diff --git a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
similarity index 97%
rename from doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
rename to doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
index aa9377c112856693cda72779bd399f2415d716f0..6d6f3035c0b5c985cd39d45df9f1bcce50dcefa0 100644
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/user_guides/howto/inference/native_infer.rst
@@ -4,12 +4,13 @@ Paddle 预测 API
为了更简单方便的预测部署,Fluid 提供了一套高层 API
用来隐藏底层不同的优化实现。
-`预测库相关代码 `__
+`预测库相关代码 `_
包括
- 头文件 ``paddle_inference_api.h`` 定义了所有的接口
- 库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``
+
编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。
下面是一些 API 概念的介绍
@@ -95,7 +96,7 @@ engine
CHECK(predictor->Run(slots, &outputs));
// 获取 outputs ...
-编译时,联编 ``libpaddle_fluid.a/.so`` 即可。
+编译时,联编 ``libpaddle_fluid.a/.so`` 便可。
详细代码参考
------------
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
index 56fa928029903f1e3bd3e8064c146797f01b2b85..cca3684b78518867eae95d82e1347b52427ddc81 100644
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
:maxdepth: 2
feeding_data
- use_recordio_reader
Python Reader
#############
diff --git a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
deleted file mode 100644
index dfda33f1b03516fe2c704f55d095955282b19109..0000000000000000000000000000000000000000
--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ /dev/null
@@ -1,167 +0,0 @@
-.. _user_guide_use_recordio_as_train_data:
-
-############################
-使用RecordIO文件作为训练数据
-############################
-
-相比于 :ref:`user_guide_use_numpy_array_as_train_data`,
-:ref:`user_guide_use_recordio_as_train_data` 的性能更好;
-但是用户需要先将训练数据集转换成RecordIO文件格式,再使用
-:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
-用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝,
-使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
-
-将训练数据转换成RecordIO文件格式
-################################
-
-:code:`fluid.recordio_writer` 中,每个记录都是一个
-:code:`vector`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
-的所有特征。例如对于图像分类来说,这个数组可以包含图片和分类标签。
-
-用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
-:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
-:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
-:ref:`user_guide_reader` 转换成多个RecordIO文件。
-
-具体使用方法为:
-
-.. code-block:: python
-
- import paddle.fluid as fluid
- import numpy
-
- def reader_creator():
- def __impl__():
- for i in range(1000):
- yield [
- numpy.random.random(size=[3,224,224], dtype="float32"),
- numpy.random.random(size=[1], dtype="int64")
- ]
- return __impl__
-
- img = fluid.layers.data(name="image", shape=[3, 224, 224])
- label = fluid.layers.data(name="label", shape=[1], dtype="int64")
- feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
-
- BATCH_SIZE = 32
- reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
- fluid.recordio_writer.convert_reader_to_recordio_file(
- "train.recordio", feeder=feeder, reader_creator=reader)
-
-其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
-:ref:`_api_fluid_data_feeder_DataFeeder`
-是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
-:ref:`user_guide_reader` 。
-
-上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件,
-其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整,
-用户可以将每一个Record的样本数设置成1。并参考
-:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
-
-
-配置神经网络, 打开RecordIO文件
-##############################
-
-RecordIO文件转换好之后,用户可以使用 :code:`fluid.layers.open_files()`
-打开文件,并使用 :code:`fluid.layers.read_file` 读取文件内容。
-简单使用方法如下:
-
-.. code-block:: python
-
- import paddle.fluid as fluid
-
- file_obj = fluid.layers.open_files(
- filenames=["train.recordio"],
- shape=[[3, 224, 224], [1]],
- lod_levels=[0, 0],
- dtypes=["float32", "int64"],
- pass_num=100
- )
-
- image, label = fluid.layers.read_file(file_obj)
-
-其中如果设置了 :code:`pass_num` ,那么当所有数据读完后,会重新读取数据,
-直到读取了 :code:`pass_num` 遍。
-
-
-
-进阶使用
-########
-
-
-使用 :code:`fluid.layers.double_buffer()`
-------------------------------------------
-
-:code:`Double buffer` 使用双缓冲技术,将训练数据从内存中复制到显存中。配置双缓冲
-需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
-
-.. code-block:: python
-
- import paddle.fliud as fluid
- file_obj = fluid.layers.open_files(...)
- file_obj = fluid.layers.double_buffer(file_obj)
-
- image, label = fluid.layers.read_file(file_obj)
-
-双缓冲技术可以参考
-`Multiple buffering `_ 。
-
-配置数据增强
-------------
-
-使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
-
-.. code-block:: python
-
- import paddle.fluid as fluid
- file_obj = fluid.layers.open_files(...)
- preprocessor = fluid.layers.Preprocessor(reader=data_file)
- with preprocessor.block():
- image, label = preprocessor.inputs()
- image = image / 2
- label = label + 1
- preprocessor.outputs(image, label)
-
-如上代码所示,使用 :code:`Preprocessor` 定义了一个数据增强模块,并在
-:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
-:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
-:code:`preprocessor.outputs()` 标记预处理后的输出。
-
-.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
-
-使用Op组batch
--------------
-
-使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
-
-.. code-block:: python
-
- import paddle.fluid as fluid
- file_obj = fluid.layers.open_files(...)
- file_obj = fluid.layers.batch(file_obj, batch_size=32)
-
- img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是,如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据,
-那么这几个样本直接组成一个批量数据进行训练。
-
-读入数据的shuffle
------------------
-
-使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
-
-.. code-block:: python
-
- import paddle.fluid as fluid
- file_obj = fluid.layers.open_files(...)
- file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
-
- img, label = fliud.layers.read_file(file_obj)
-
-需要注意的是:
-
-1. :code:`shuffle` 实现方法是:
-先读入 :code:`buffer_size` 条样本,再随机的选出样本进行训练。
-
-2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存,需要确定训练过程中内存
-足够支持缓存 :code:`buffer_size` 条数据。
diff --git a/doc/fluid/new_docs/user_guides/index.rst b/doc/fluid/new_docs/user_guides/index.rst
index 453cb71cfdf72e031ce0f0517e2db936eca38dfc..377631109d8f65c149b12cd2a0e4da920fdf4def 100644
--- a/doc/fluid/new_docs/user_guides/index.rst
+++ b/doc/fluid/new_docs/user_guides/index.rst
@@ -15,4 +15,5 @@
howto/training/index
howto/debug/index
howto/evaluation/index
+ howto/inference/index
models/index.rst
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index b6ae930b7155d15d24b287cc3eed50f2aeaa5599..70e5b97770a6c581c6a9c0145b03c42b83f14471 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
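For context, a minimal usage sketch of the newly listed `sequence_enumerate` layer, based only on the ArgSpec above (argument names and defaults are taken from that entry; the exact output layout is described in the layer's API documentation):

```python
import paddle.fluid as fluid

# A sequence of word ids with one level of LoD information.
x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=1)
# Enumerate all length-2 sub-sequences of each input sequence, padding the
# tail with pad_value when a window runs past the end of the sequence.
enumerated = fluid.layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
```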
@@ -311,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -375,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
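For reference, a short sketch of how the new `centered` argument of `RMSPropOptimizer` would be passed, using the defaults shown in the ArgSpec above (illustrative only):

```python
import paddle.fluid as fluid

optimizer = fluid.optimizer.RMSPropOptimizer(
    learning_rate=0.01,
    rho=0.95,
    epsilon=1e-6,
    momentum=0.0,
    centered=True)  # new flag: use the centered RMSProp variant
```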
diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5132131e55e2feee8ae88b4c65ec102fbc9c5fe1
--- /dev/null
+++ b/paddle/fluid/framework/.gitignore
@@ -0,0 +1,2 @@
+.tensor_util.cu
+.data_type_transform.cu
\ No newline at end of file
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 0668ff43c8192f53ff7e05abaeb575e2b78b1de4..cc7938b2ac07f11ceb7f33a2e37380d1e2ed2072 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,3 +1,22 @@
+# Windows treats a symbolic file as a real file, which is different from Unix.
+# We create a hidden file and compile it instead of the original source file.
+function(windows_symbolic TARGET)
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ foreach(src ${windows_symbolic_SRCS})
+ get_filename_component(src ${src} NAME_WE)
+ if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
+ message(FATAL " ${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
+ endif()
+ add_custom_command(OUTPUT .${src}.cu
+ COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
+ COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
+ COMMENT "create hidden file of ${src}.cu")
+ add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
+ endforeach()
+endfunction()
+
add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details)
@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
if(WITH_GPU)
- nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+ if (WIN32)
+ windows_symbolic(tensor_util SRCS tensor_util.cu)
+ nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+ add_dependencies(tensor tensor_util)
+ else()
+ nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+ endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
endif()
@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function)
if(WITH_GPU)
- nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+ if (WIN32)
+ windows_symbolic(hidden_file SRCS data_type_transform.cu)
+ nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+ add_dependencies(data_type_transform hidden_file)
+ else()
+ nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+ endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 0bfff745493d069e948e6d277ec2bbfb0673a70b..7a99169849debcbc57d6f197b36c5045b211f3ef 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -326,7 +326,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl(
ir::Graph &result = *graph;
for (auto &node : nodes) {
- if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
+ if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var());
}
}
@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
}
}
-bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
- const std::string &og,
- std::unordered_set *og_has_been_broadcast) const {
- bool is_pg_once =
- grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
- if (is_pg_once) {
- // Insert NCCL AllReduce Op
- og_has_been_broadcast->insert(og);
- }
- return is_pg_once;
-}
-
int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
return var;
}
-// Find the first occurence of `prev_op_name` and make current `op` depend
-// on it.
-void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
- const std::string &prev_op_name) const {
- for (auto &prev_op : result->Get(kGraphOps)) {
- if (prev_op->Name() == prev_op_name) {
- auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
- prev_op->AddOutput(dep_var);
- result->Get(kGraphDepVars).emplace(dep_var);
- op->AddInput(dep_var);
- }
- }
-}
-
void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 7a6f238f9cf7af18cb10ea271e453fec1902c833..ac6d9c5a64cfde60f75c76dae0a30cc7d735e996 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
std::vector FindDistTrainRecvVars(
const std::vector &nodes) const;
- void ConnectOp(ir::Graph *result, OpHandleBase *op,
- const std::string &prev_op_name) const;
-
void CreateComputationalOps(ir::Graph *result, ir::Node *node,
size_t num_places) const;
@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
void CreateComputationalOp(ir::Graph *result, ir::Node *node,
int dev_id) const;
- bool IsParameterGradientOnce(
- const std::string &og,
- std::unordered_set *og_has_been_broadcast) const;
-
int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;
void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index bfc649017f19d67660bd11d590134cf56772bb27..f5235f70ad79616801110644999d511eeda33a32 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -1,20 +1,35 @@
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n")
+file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
+function(pass_library TARGET)
+ set(options "")
+ set(oneValueArgs "")
+ set(multiValueArgs SRCS DEPS)
+ cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+ cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
+ file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+ set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+endfunction()
+
cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node)
cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
-cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
-cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)
+
+pass_library(graph_to_program_pass)
+pass_library(graph_viz_pass)
+pass_library(fc_fuse_pass)
+pass_library(attention_lstm_fuse_pass)
+pass_library(infer_clean_graph_pass)
+pass_library(fc_lstm_fuse_pass)
+pass_library(seq_concat_fc_fuse_pass)
+set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index d2d051a69a33a38535e67227d4cc62f5b35e430c..bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,10 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"
namespace paddle {
namespace framework {
@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
auto* cell_init = graph->RetriveNode(6);
auto* hidden_init = graph->RetriveNode(8);
-#define LINK_TO(node0, node1) \
- node0->outputs.push_back(node1); \
- node1->inputs.push_back(node0);
-
auto* lstm_op = graph->CreateOpNode(&op_desc);
PrepareParameters(graph, param);
- LINK_TO(X, lstm_op);
- LINK_TO(cell_init, lstm_op);
- LINK_TO(hidden_init, lstm_op);
- LINK_TO(lstm_op, LSTMOUT);
+ IR_NODE_LINK_TO(X, lstm_op);
+ IR_NODE_LINK_TO(cell_init, lstm_op);
+ IR_NODE_LINK_TO(hidden_init, lstm_op);
+ IR_NODE_LINK_TO(lstm_op, LSTMOUT);
GraphSafeRemoveNodes(graph, marked_nodes);
}
@@ -216,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
float* out_data = out->mutable_data(platform::CPUPlace());
std::array tensors(
- {W_forget_w0.data(), W_input_w0.data(),
- W_output_w0.data(), W_cell_w0.data()});
+ {{W_forget_w0.data(), W_input_w0.data(),
+ W_output_w0.data(), W_cell_w0.data()}});
std::array tensors1(
- {W_forget_w1.data(), W_input_w1.data(),
- W_output_w1.data(), W_cell_w1.data()});
+ {{W_forget_w1.data(), W_input_w1.data(),
+ W_output_w1.data(), W_cell_w1.data()}});
for (int row = 0; row < D; row++) {
for (int col = 0; col < 4; col++) {
@@ -243,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
const LoDTensor& B_output, const LoDTensor& B_cell,
LoDTensor* out) {
std::array tensors(
- {B_forget.data(), B_input.data(), B_output.data(),
- B_cell.data()});
+ {{B_forget.data(), B_input.data(), B_output.data(),
+ B_cell.data()}});
PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
int D = B_forget.dims()[0];
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 513742bab69d465aac1bfb7bcef2fe89108c14a0..5a4ebd6f3de555acccd72c61bd377ffd8ce69780 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -21,74 +21,26 @@ namespace paddle {
namespace framework {
namespace ir {
-bool VarOutLinksToOp(Node* node, const std::string& op_type) {
- for (auto* out : node->outputs) {
- if (out->IsOp() && out->Op()->Type() == op_type) {
- return true;
- }
- }
- return false;
-}
-
-void BuildFCPattern(PDPattern* pattern) {
- // Create Operators
- auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
- auto* elementwise_add_op =
- pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
- // Create variables
- // w
- auto* mul_weight_var = pattern->NewNode("mul_weight")
- ->AsInput()
- ->assert_is_op_nth_input("mul", "Y", 0);
- // x
- auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
- ->AsInput()
- ->assert_is_op_nth_input("mul", "X", 0);
- // intermediate variable, will be removed in the IR after fuse.
- auto* mul_out_var = pattern->NewNode("mul_out")
- ->AsIntermediate()
- ->assert_is_only_output_of_op("mul")
- ->assert_is_op_input("elementwise_add");
- // bias
- auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
- ->assert_is_op_input("elementwise_add")
- ->AsInput();
- // output
- auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
- ->AsOutput()
- ->assert_is_op_output("elementwise_add");
-
- mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
- elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
- .LinksTo({elementwise_add_out_var});
-}
-
-// Replace the node `from` in the links to `to`
-bool LinksReplace(std::vector* links, Node* from, Node* to) {
- for (auto*& n : *links) {
- if (n == from) {
- n = to;
- return true;
- }
- }
- return false;
-}
-
std::unique_ptr FCFusePass::ApplyImpl(
std::unique_ptr graph) const {
PADDLE_ENFORCE(graph.get());
- FusePassBase::Init("fc", graph.get());
+ FusePassBase::Init("fc_fuse", graph.get());
std::unordered_set nodes2delete;
GraphPatternDetector gpd;
- BuildFCPattern(gpd.mutable_pattern());
-
-#define GET_NODE(id) \
- PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
- "pattern has no Node called %s", #id); \
- auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id)); \
- PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+ // BuildFCPattern(gpd.mutable_pattern());
+ auto* x = gpd.mutable_pattern()
+ ->NewNode("fc_fuse/x")
+ ->AsInput()
+ ->assert_is_op_input("mul", "X");
+ patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);
+
+#define GET_NODE(id) \
+ PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
+ "pattern has no Node called %s", #id); \
+ auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id)); \
+ PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);
int found_fc_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -98,43 +50,33 @@ std::unique_ptr FCFusePass::ApplyImpl(
// scenerio.
// FC's fusion is simple, just op fuse, no need to process the
// parameters.
- GET_NODE(mul_tmp_var); // x
- GET_NODE(mul_weight); // Y
- GET_NODE(elementwise_add_tmpvar); // bias
- GET_NODE(elementwise_add_out); // Out
- GET_NODE(mul); // MUL op
- GET_NODE(elementwise_add); // ELEMENT_ADD op
- GET_NODE(mul_out); // tmp
+ GET_NODE(x); // x
+ GET_NODE(w); // Y
+ GET_NODE(fc_bias); // bias
+ GET_NODE(fc_out); // Out
+ GET_NODE(mul); // MUL op
+ GET_NODE(elementwise_add); // ELEMENT_ADD op
+ GET_NODE(mul_out); // tmp
#undef GET_NODE
// Create an FC Node.
OpDesc desc;
- std::string fc_x_in = mul_tmp_var->Name();
- std::string fc_Y_in = mul_weight->Name();
- std::string fc_bias_in = elementwise_add_tmpvar->Name();
- std::string fc_out = elementwise_add_out->Name();
+ std::string fc_x_in = x->Name();
+ std::string fc_Y_in = w->Name();
+ std::string fc_bias_in = fc_bias->Name();
+ std::string fc_out_out = fc_out->Name();
desc.SetInput("Input", std::vector({fc_x_in}));
desc.SetInput("W", std::vector({fc_Y_in}));
desc.SetInput("Bias", std::vector({fc_bias_in}));
- desc.SetOutput("Out", std::vector({fc_out}));
+ desc.SetOutput("Out", std::vector({fc_out_out}));
desc.SetType("fc");
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
- fc_node->inputs =
- std::vector({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
- fc_node->outputs.push_back(elementwise_add_out);
-
- // Update link relatons
- PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
- PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
- PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
- elementwise_add, fc_node));
- PADDLE_ENFORCE(
- LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+ GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});
- // Drop old nodes
- graph->RemoveNode(mul);
- graph->RemoveNode(elementwise_add);
- graph->RemoveNode(mul_out); // tmp variable
+ IR_NODE_LINK_TO(x, fc_node);
+ IR_NODE_LINK_TO(w, fc_node);
+ IR_NODE_LINK_TO(fc_bias, fc_node);
+ IR_NODE_LINK_TO(fc_node, fc_out);
found_fc_count++;
};
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 5852705b6b8d1c650faeae3dc810aac65353b459..0d69dfa79aa26940f8f56f84b35ffed34f29f703 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -11,39 +11,39 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
+#include
+#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace framework {
namespace ir {
-std::unique_ptr FCLstmFusePass::ApplyImpl(
- std::unique_ptr graph) const {
- GraphPatternDetector gpd;
- auto* pattern = gpd.mutable_pattern();
-
- std::unordered_set fused_ops({// first lstm
- 13, 15, 16,
- // second lstm
- 23, 25, 26});
-
- pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
- "any_node");
+std::string GenNodeName(const std::string& prefix, const std::string& name) {
+ return prefix + "/" + name;
+}
- std::unordered_set marked_nodes;
+void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+ bool with_fc_bias) {
+ PDNode* x = pattern->NewNode(name_scope, "x")
+ ->assert_is_op_input("mul")
+ ->assert_var_not_persistable();
+ auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
+ fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse.
+ patterns::LSTM(pattern, name_scope, fc_out);
+ // LOG(INFO) << "\n" << pattern->DotString();
+}
- auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
- Graph* g) {
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
+ bool with_fc_bias) {
+ GraphPatternDetector gpd;
+ auto* pattern = gpd.mutable_pattern();
- auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
- marked_nodes.insert(id);
- };
- gpd(graph.get(), handler);
+ BuildPattern(pattern, name_scope, with_fc_bias);
// Create New OpDesc
auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
- int bias, int hidden, int cell, int xx) {
+ int bias, int hidden, int cell, int xx, int fc_bias) {
#define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
GET_NODE(input);
GET_NODE(weight_x);
@@ -61,61 +61,147 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
SET_IN(WeightX, weight_x);
SET_IN(WeightH, weight_h);
SET_IN(Bias, bias);
-#undef GET_NODE
#undef SET_IN
+ if (with_fc_bias) {
+ // Add FC-bias with LSTM-bias and create a new weight
+ PADDLE_ENFORCE(scope);
+ const std::string& new_bias_var = name_scope + "_bias.new";
+ auto* bias_var = scope->Var(new_bias_var);
+ PADDLE_ENFORCE(bias_var);
+ auto* bias_tensor = bias_var->GetMutable();
+ auto* lstm_bias_var = scope->FindVar(bias_n->Name());
+ PADDLE_ENFORCE(lstm_bias_var);
+ const auto& lstm_bias_tensor = lstm_bias_var->Get();
+ bias_tensor->Resize(lstm_bias_tensor.dims());
+
+ GET_NODE(fc_bias);
+ auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+ const auto& fc_bias_tensor = fc_bias_var->Get();
+
+ auto* data = bias_tensor->mutable_data(platform::CPUPlace());
+
+ for (int i = 0; i < bias_tensor->numel(); i++) {
+ data[i] =
+ fc_bias_tensor.data()[i] + lstm_bias_tensor.data()[i];
+ }
+ op_desc.SetInput("Bias", {new_bias_var});
+ }
+#undef GET_NODE
- VLOG(4) << "hidden_n: " << hidden_n->Name();
- VLOG(4) << "cell: " << cell_n->Name();
- VLOG(4) << "xx: " << xx_n->Name();
+ // Create temp variables.
+ scope->Var(name_scope + "/BatchedInput.new")
+ ->GetMutable();
+ scope->Var(name_scope + "/BatchCellPreAct.new")
+ ->GetMutable();
+ scope->Var(name_scope + "/BatchedGate.new")
+ ->GetMutable();
op_desc.SetInput("H0", {});
op_desc.SetInput("C0", {});
op_desc.SetOutput("Hidden", {hidden_n->Name()});
op_desc.SetOutput("Cell", {cell_n->Name()});
op_desc.SetOutput("XX", {xx_n->Name()});
- op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
- op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
+ op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
+ op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
+ op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
- op_desc.SetAttr("use_peepholes", false);
- auto* op = graph->CreateOpNode(&op_desc);
+ op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
+ // TODO(TJ): get from attr
+ op_desc.SetAttr("use_seq", true);
+
+#define TMP_NAME(x) "at.new.tmp." #x
+#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
+ OP_SET_OUT(BatchedCell);
+ OP_SET_OUT(BatchedHidden);
+ OP_SET_OUT(ReorderedH0);
+ OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT
-#define LINK_TO(a, b) \
- a->outputs.push_back(b); \
- b->inputs.push_back(a);
- LINK_TO(input_n, op);
- LINK_TO(weight_x_n, op);
- LINK_TO(weight_h_n, op);
- LINK_TO(bias_n, op);
- LINK_TO(op, hidden_n);
-#undef LINK_TO
+ auto* op = graph->CreateOpNode(&op_desc);
+ PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+ auto* scope = graph->Get(kParamScopeAttr);
+
+#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable()
+ TMP_NEW(BatchedCell);
+ TMP_NEW(BatchedHidden);
+ TMP_NEW(ReorderedH0);
+ TMP_NEW(ReorderedC0);
+#undef TMP_NEW
+#undef TMP_NAME
+
+ IR_NODE_LINK_TO(input_n, op);
+ IR_NODE_LINK_TO(weight_x_n, op);
+ IR_NODE_LINK_TO(weight_h_n, op);
+ IR_NODE_LINK_TO(bias_n, op);
+ IR_NODE_LINK_TO(op, hidden_n);
return op;
+ };
+
+ int fusion_count{0};
+
+ auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+ Graph* g) {
+#define GET_NODE(name__) \
+ std::string name__##key = name_scope + "/" + #name__; \
+ auto* name__##n = pattern->RetrieveNode(name__##key); \
+ PADDLE_ENFORCE(name__##n); \
+ PADDLE_ENFORCE(subgraph.count(name__##n)); \
+ Node* name__##_n = subgraph.at(name__##n); \
+ int name__ __attribute__((unused)) = name__##_n->id();
+
+ GET_NODE(x);
+ GET_NODE(w);
+ GET_NODE(mul);
+ GET_NODE(fc_out);
+ GET_NODE(Weight);
+ GET_NODE(lstm);
+ GET_NODE(Bias);
+ GET_NODE(Hidden);
+ GET_NODE(Cell);
+
+ if (with_fc_bias) {
+ GET_NODE(fc_bias);
+ GET_NODE(elementwise_add);
+ lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+ // Remove unneeded nodes.
+ std::unordered_set marked_nodes(
+ {mul_n, lstm_n, elementwise_add_n});
+ GraphSafeRemoveNodes(graph, marked_nodes);
+ } else {
+ lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
+ // Remove unneeded nodes.
+ std::unordered_set marked_nodes({mul_n, lstm_n});
+ GraphSafeRemoveNodes(graph, marked_nodes);
+ }
+#undef GET_NODE
+ ++fusion_count;
};
- lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
- lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
+ gpd(graph, handler);
- // remove all the nodes
+ return fusion_count;
+}
- for (auto* node : marked_nodes) {
- graph->RemoveNode(const_cast(node));
- }
+std::unique_ptr MulLstmFusePass::ApplyImpl(
+ std::unique_ptr graph) const {
+ FusePassBase::Init(name_scope_, graph.get());
- for (auto* node : graph->Nodes()) {
- for (auto it = node->inputs.begin(); it != node->inputs.end();) {
- if (marked_nodes.count(*it)) {
- it = const_cast(node)->inputs.erase(it);
- } else
- it++;
- }
- for (auto it = node->outputs.begin(); it != node->outputs.end();) {
- if (marked_nodes.count(*it)) {
- it = const_cast(node)->outputs.erase(it);
- } else
- it++;
- }
- }
+ int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+ false /*with_fc_bias*/);
+
+ AddStatis(fusion_count);
+ return graph;
+}
+
+std::unique_ptr FCLstmFusePass::ApplyImpl(
+ std::unique_ptr graph) const {
+ FusePassBase::Init(name_scope_, graph.get());
+
+ int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+ true /*with_fc_bias*/);
+ AddStatis(fusion_count);
return graph;
}
@@ -123,4 +209,5 @@ std::unique_ptr FCLstmFusePass::ApplyImpl(
} // namespace framework
} // namespace paddle
+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
index 74b08ae558b12c9328db58687cd01edbc37291a8..3ee32c63a46fcc34bdccd1e14d4bbaf9668c49e9 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -12,20 +12,36 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
-class FCLstmFusePass : public Pass {
+// The FCLstmFusePass and MulLstmFusePass both fuse to the same FusionLstm op.
+
+// Just FC without bias
+class FCLstmFusePass : public FusePassBase {
public:
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+
+ const std::string name_scope_{"fc_lstm_fuse"};
+};
+
+class MulLstmFusePass : public FusePassBase {
+ public:
+ virtual ~MulLstmFusePass() {}
+
+ protected:
+ std::unique_ptr ApplyImpl(std::unique_ptr graph) const;
+ const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
} // namespace ir
diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h
index 55e495a0ed75c3a09703438dcfe01ca8f9d36118..ae8496204d4aeb88c04154d571325d440274e821 100644
--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -167,7 +167,6 @@ class Graph {
std::map> attr_dels_;
std::map> nodes_;
std::unordered_set node_set_;
- int node_count_{0};
};
bool IsControlDepVar(const ir::Node &var);
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 945ab110b148c320b6626cadaa47d483df68419e..731b89423354532f684e19305dfa87e8eb75d4b1 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
@@ -71,7 +72,10 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {
void GraphPatternDetector::operator()(Graph* graph,
GraphPatternDetector::handle_t handler) {
- if (!MarkPDNodesInGraph(*graph)) return;
+ if (!MarkPDNodesInGraph(*graph)) {
+ return;
+ }
+
auto subgraphs = DetectPatterns();
UniquePatterns(&subgraphs);
RemoveOverlappedMatch(&subgraphs);
@@ -81,13 +85,13 @@ void GraphPatternDetector::operator()(Graph* graph,
LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
int id = 0;
for (auto& g : subgraphs) {
- LOG(INFO) << "optimizing #" << id++ << " subgraph";
+ VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
- VLOG(4) << "mark pdnodes in graph";
+ VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto& node : GraphTraits::DFS(graph)) {
@@ -106,7 +110,13 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
return false;
}
}
+ for (auto& item : pdnodes2nodes_) {
+ for (auto& n : item.second) {
+ GetMarkedNodes(const_cast(&graph)).insert(n);
+ }
+ }
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+
return !pdnodes2nodes_.empty();
}
@@ -272,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
for (const auto& subgraph : *subgraphs) {
bool valid = true;
for (auto& item : subgraph) {
- if (node_set.count(item.second)) {
+ if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
@@ -328,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector& others) {
}
PDNode* PDNode::assert_is_op() {
- asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); });
+ asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
return this;
}
PDNode* PDNode::assert_is_op(const std::string& op_type) {
- asserts_.emplace_back([this, op_type](Node* x) {
+ asserts_.emplace_back([op_type](Node* x) {
return x && x->IsOp() && x->Op()->Type() == op_type;
});
return this;
}
PDNode* PDNode::assert_is_var() {
- asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); });
+ asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
return this;
}
PDNode* PDNode::assert_var_not_persistable() {
assert_is_var();
- asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); });
+ asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
return this;
}
PDNode* PDNode::assert_is_persistable_var() {
@@ -357,7 +367,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
assert_is_op_input(op_type);
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->outputs) {
- if (IsNthInput(x, op, argument, nth)) return true;
+ if (op->IsOp() && op->Op()->Type() == op_type &&
+ IsNthInput(x, op, argument, nth))
+ return true;
}
return false;
});
@@ -368,7 +380,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
assert_is_var();
asserts_.emplace_back([=](Node* x) {
for (auto* op : x->inputs) {
- if (IsNthOutput(x, op, argument, nth)) return true;
+ if (op->IsOp() && op->Op()->Type() == op_type &&
+ IsNthOutput(x, op, argument, nth))
+ return true;
}
return false;
});
@@ -412,6 +426,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
});
return this;
}
+PDNode* PDNode::assert_is_op_output(const std::string& op_type,
+ const std::string& argument) {
+ assert_is_var();
+ assert_is_op_nth_output(op_type, argument, 0);
+ return this;
+}
PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
assert_is_var();
asserts_.emplace_back([=](Node* x) {
@@ -424,6 +444,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
});
return this;
}
+PDNode* PDNode::assert_is_op_input(const std::string& op_type,
+ const std::string& argument) {
+ assert_is_var();
+ assert_is_op_nth_input(op_type, argument, 0);
+ return this;
+}
PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
@@ -439,6 +465,130 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
return this;
}
+bool VarLinksToOp(Node* node, const std::string& op_type) {
+ for (auto* out : node->outputs) {
+ if (out->IsOp() && out->Op()->Type() == op_type) {
+ return true;
+ }
+ }
+ return false;
+}
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
+ PADDLE_ENFORCE(var->IsVar());
+ PADDLE_ENFORCE(op->IsOp());
+ if (op->Op()->Input(argument).size() <= nth) return false;
+ return var->Name() == op->Op()->Input(argument)[nth];
+}
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
+ PADDLE_ENFORCE(var->IsVar());
+ PADDLE_ENFORCE(op->IsOp());
+ if (op->Op()->Output(argument).size() <= nth) return false;
+ return var->Name() == op->Op()->Output(argument)[nth];
+}
+void GraphSafeRemoveNodes(Graph* graph,
+ const std::unordered_set<const Node *>& nodes) {
+ for (auto* node : nodes) {
+ graph->RemoveNode(const_cast<Node *>(node));
+ }
+
+ for (auto* node : graph->Nodes()) {
+ for (auto it = node->inputs.begin(); it != node->inputs.end();) {
+ if (nodes.count(*it)) {
+ it = const_cast<Node *>(node)->inputs.erase(it);
+ } else {
+ it++;
+ }
+ }
+ for (auto it = node->outputs.begin(); it != node->outputs.end();) {
+ if (nodes.count(*it)) {
+ it = const_cast<Node *>(node)->outputs.erase(it);
+ } else {
+ it++;
+ }
+ }
+ }
+}
+bool VarLinksFromOp(Node* node, const std::string& op_type) {
+ for (auto* out : node->inputs) {
+ if (out->IsOp() && out->Op()->Type() == op_type) {
+ return true;
+ }
+ }
+ return false;
+}
+
+PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
+ PDNode* x, bool with_bias) {
+ // Create Operators
+ PDNode* elementwise_add_op{nullptr};
+ auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
+ if (with_bias) {
+ elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
+ ->assert_is_op("elementwise_add");
+ }
+ // Create variables
+ // w
+ auto* mul_weight_var = pattern->NewNode(name_scope, "w")
+ ->AsInput()
+ ->assert_is_persistable_var()
+ ->assert_is_op_nth_input("mul", "Y", 0);
+ PDNode* mul_out_var{nullptr};
+ if (with_bias) {
+ // Intermediate variable; it will be removed from the IR after the fuse.
+ mul_out_var = pattern->NewNode(name_scope, "mul_out")
+ ->AsIntermediate()
+ ->assert_is_only_output_of_op("mul")
+ ->assert_is_op_input("elementwise_add");
+ }
+ PDNode *bias{nullptr}, *fc_out{nullptr};
+ if (with_bias) {
+ // bias
+ bias = pattern->NewNode(name_scope, "fc_bias")
+ ->assert_is_op_input("elementwise_add")
+ ->AsInput();
+ // output
+ fc_out = pattern->NewNode(name_scope, "fc_out")
+ ->AsOutput()
+ ->assert_is_op_output("elementwise_add");
+ } else {
+ fc_out = pattern->NewNode(name_scope, "fc_out")
+ ->AsOutput()
+ ->assert_is_op_output("mul");
+ }
+
+ if (with_bias) {
+ mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
+ elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+ } else {
+ mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
+ }
+
+ return fc_out;
+}
+PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
+ PDNode* x) {
+ x->assert_is_op_input("lstm", "Input");
+ auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
+#define NEW_NODE(arg__, io__) \
+ auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+ ->assert_is_op_##io__("lstm", #arg__);
+
+ // Currently, the H0 and C0 are optional
+ // TODO(Superjomn) upgrade the fuse framework to support optional.
+ // NEW_NODE(H0, input);
+ // NEW_NODE(C0, input);
+ NEW_NODE(Weight, input);
+ NEW_NODE(Bias, input);
+
+ NEW_NODE(Hidden, output);
+ NEW_NODE(Cell, output);
+ NEW_NODE(BatchGate, output);
+ NEW_NODE(BatchCellPreAct, output);
+
+ lstm_op->LinksFrom({x, Weight, Bias});
+ lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
+ return Hidden;
+}
} // namespace ir
} // namespace framework
} // namespace paddle
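
Note on the detector API touched above: a pass builds a PDPattern, then calls the detector with a handler that receives each match as a PDNode-to-Node mapping. A minimal sketch, assuming a Graph* named graph is in scope and that subgraph_t maps PDNode* to the matched Node*; the node id "demo/mul" and the handler body are illustrative, not part of this patch:

    GraphPatternDetector detector;
    auto* mul = detector.mutable_pattern()
                    ->NewNode("demo/mul")      // hypothetical node id
                    ->assert_is_op("mul")
                    ->AsIntermediate();        // allowed to be erased by the handler
    detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph,
                        Graph* g) {
      Node* matched = subgraph.at(mul);        // the concrete "mul" op node
      GraphSafeRemoveNodes(g, {matched});      // dangling edges are cleaned up
    });
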
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index f8488c84962d1caa6e7817b3c0349d6da3a59182..eacea1750f6f1e86a8fe79637c3bd757a7275398 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -19,6 +19,9 @@
#endif
#include
+#include
+#include
+#include
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/inference/analysis/dot.h"
@@ -95,7 +98,11 @@ struct PDNode {
PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var();
PDNode* assert_is_op_output(const std::string& op_type);
+ PDNode* assert_is_op_output(const std::string& op_type,
+ const std::string& argument);
PDNode* assert_is_op_input(const std::string& op_type);
+ PDNode* assert_is_op_input(const std::string& op_type,
+ const std::string& argument);
PDNode* assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth);
PDNode* assert_is_op_nth_output(const std::string& op_type,
@@ -167,6 +174,9 @@ class PDPattern {
PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
PDNode* NewNode(const std::string& name = NewID());
+ PDNode* NewNode(const std::string& prefix, const std::string& name) {
+ return NewNode(prefix + "/" + name);
+ }
PDNode* RetrieveNode(const std::string& id) const;
const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
@@ -238,6 +248,8 @@ class GraphPatternDetector {
void UniquePatterns(std::vector* subgraphs);
// Remove overlapped match subgraphs, when overlapped, keep the previous one.
+ // The intermediate PDNodes will be removed, so they can't be shared by
+ // multiple patterns.
void RemoveOverlappedMatch(std::vector* subgraphs);
// Validate whether the intermediate nodes are linked by external nodes.
@@ -257,64 +269,40 @@ class GraphPatternDetector {
// some helper methods.
-// Op's input.
-static bool VarLinksToOp(Node* node, const std::string& op_type) {
- for (auto* out : node->outputs) {
- if (out->IsOp() && out->Op()->Type() == op_type) {
- return true;
- }
- }
- return false;
-}
-
-// Op's output.
-static bool VarLinksFromOp(Node* node, const std::string& op_type) {
- for (auto* out : node->inputs) {
- if (out->IsOp() && out->Op()->Type() == op_type) {
- return true;
- }
- }
- return false;
-}
+// Tell if a var links to an Op
+bool VarLinksToOp(Node* node, const std::string& op_type);
+
+// Tell if a var is linked from an op of the given type
+bool VarLinksFromOp(Node* node, const std::string& op_type);
// Check whether a var node is an op node's nth input.
-static bool IsNthInput(Node* var, Node* op, const std::string& argument,
- size_t nth) {
- PADDLE_ENFORCE(var->IsVar());
- PADDLE_ENFORCE(op->IsOp());
- if (op->inputs.size() <= nth) return false;
- return var->Name() == op->Op()->Input(argument)[nth];
-}
-
-static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
- size_t nth) {
- PADDLE_ENFORCE(var->IsVar());
- PADDLE_ENFORCE(op->IsOp());
- if (op->inputs.size() <= nth) return false;
- return var->Name() == op->Op()->Output(argument)[nth];
-}
-
-static void GraphSafeRemoveNodes(Graph* graph,
- const std::unordered_set& nodes) {
- for (auto* node : nodes) {
- graph->RemoveNode(const_cast(node));
- }
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);
- for (auto* node : graph->Nodes()) {
- for (auto it = node->inputs.begin(); it != node->inputs.end();) {
- if (nodes.count(*it)) {
- it = const_cast(node)->inputs.erase(it);
- } else
- it++;
- }
- for (auto it = node->outputs.begin(); it != node->outputs.end();) {
- if (nodes.count(*it)) {
- it = const_cast(node)->outputs.erase(it);
- } else
- it++;
- }
- }
-}
+// Tell whether a var node is an op node's nth output.
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
+
+// Safely remove some nodes from the graph; dangling edges are cleaned up automatically.
+void GraphSafeRemoveNodes(Graph* graph,
+ const std::unordered_set<const Node *>& nodes);
+
+// Some pre-defined patterns that can be reused in multiple passes.
+namespace patterns {
+
+// FC with bias
+// op: mul + elementwise_add
+// named nodes:
+// mul, elementwise_add
+// w, mul_out, bias, fc_out
+PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
+ bool with_bias);
+
+PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
+} // namespace patterns
+
+#define IR_NODE_LINK_TO(a, b) \
+ a->outputs.push_back(b); \
+ b->inputs.push_back(a);
} // namespace ir
} // namespace framework
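
To make the new patterns namespace and IR_NODE_LINK_TO concrete, here is a hedged sketch of how a fuse pass could combine them, assuming a PDPattern* named pattern and a match handler as in the sketch after graph_pattern_detector.cc above; the scope name "fc_fuse", the retrieval keys (which follow the "name_scope/name" ids that patterns::FC assigns), and the fused op node are illustrative:

    // Build the pattern: x -> mul -> elementwise_add -> fc_out.
    auto* x = pattern->NewNode("fc_fuse", "x")
                  ->AsInput()
                  ->assert_is_op_input("mul", "X");
    patterns::FC(pattern, "fc_fuse", x, true /*with_bias*/);

    // Inside the match handler: look up named nodes and wire in the fused op.
    Node* w = subgraph.at(pattern->RetrieveNode("fc_fuse/w"));
    Node* out = subgraph.at(pattern->RetrieveNode("fc_fuse/fc_out"));
    // auto* fc_op = graph->CreateOpNode(&fc_desc);  // fused op, illustrative
    // IR_NODE_LINK_TO(w, fc_op);
    // IR_NODE_LINK_TO(fc_op, out);
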
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
index 7e5c86b033a7c69a306491cf4bf8d099018c5f19..6c466fb21fb46e09961dc874e9e39655f83d17c6 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
},
"OP0");
- auto* any_var = x.mutable_pattern()->NewNode(
- [](Node* node) { return node->IsVar(); }, "VAR");
+ auto* any_var = x.mutable_pattern()
+ ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
+ ->AsIntermediate();
auto* any_op1 = x.mutable_pattern()->NewNode(
[](Node* node) { return node->IsOp(); }, "OP1");
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc
index 4c7ffe69e933de3d52c8f762a1eeb73de17e0561..31ed98db72c8fd4af8c970861d386687962001ce 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -50,20 +50,37 @@ std::unique_ptr GraphVizPass::ApplyImpl(
Dot dot;
- std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
- Dot::Attr("shape", "box"),
- Dot::Attr("fillcolor", "red")});
- std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
- // Dot::Attr("shape", "diamond"),
- Dot::Attr("fillcolor", "yellow")});
-
- std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
- Dot::Attr("shape", "box"),
- Dot::Attr("fillcolor", "lightgray")});
- std::vector<Dot::Attr> marked_var_attrs(
- {Dot::Attr("style", "filled,rounded"),
- // Dot::Attr("shape", "diamond"),
- Dot::Attr("fillcolor", "lightgray")});
+ const std::vector<Dot::Attr> op_attrs({
+ Dot::Attr("style", "rounded,filled,bold"), //
+ Dot::Attr("shape", "box"), //
+ Dot::Attr("color", "#303A3A"), //
+ Dot::Attr("fontcolor", "#ffffff"), //
+ Dot::Attr("width", "1.3"), //
+ Dot::Attr("height", "0.84"), //
+ Dot::Attr("fontname", "Arial"), //
+ });
+ const std::vector<Dot::Attr> arg_attrs({
+ Dot::Attr("shape", "box"), //
+ Dot::Attr("style", "rounded,filled,bold"), //
+ Dot::Attr("fontname", "Arial"), //
+ Dot::Attr("fillcolor", "#999999"), //
+ Dot::Attr("color", "#dddddd"), //
+ });
+
+ const std::vector<Dot::Attr> param_attrs({
+ Dot::Attr("shape", "box"), //
+ Dot::Attr("style", "rounded,filled,bold"), //
+ Dot::Attr("fontname", "Arial"), //
+ Dot::Attr("color", "#148b97"), //
+ Dot::Attr("fontcolor", "#ffffff"), //
+ });
+
+ const std::vector<Dot::Attr> marked_op_attrs(
+ {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
+ Dot::Attr("fillcolor", "yellow")});
+ const std::vector<Dot::Attr> marked_var_attrs(
+ {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
+ Dot::Attr("fillcolor", "yellow")});
auto marked_nodes = ConsumeMarkedNodes(graph.get());
// Create nodes
@@ -74,9 +91,17 @@ std::unique_ptr GraphVizPass::ApplyImpl(
marked_nodes.count(n) ? marked_op_attrs : op_attrs;
dot.AddNode(node_id, attr, node_id);
} else if (n->IsVar()) {
- decltype(op_attrs) attr =
- marked_nodes.count(n) ? marked_var_attrs : var_attrs;
- dot.AddNode(node_id, attr, node_id);
+ decltype(op_attrs)* attr;
+ if (marked_nodes.count(n)) {
+ attr = &marked_var_attrs;
+ } else if (const_cast<Node *>(n)->Var() &&
+ const_cast<Node *>(n)->Var()->Persistable()) {
+ attr = &param_attrs;
+ } else {
+ attr = &arg_attrs;
+ }
+
+ dot.AddNode(node_id, *attr, node_id);
}
node2dot[n] = node_id;
}
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h
index 8d885cb9e4ee6e01de386b0f22423988dbe60ca6..e64916a5bb662e3b00cfe212f0bbbc537c7bc2cc 100644
--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
};
+static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
+ if (!graph->Has(kGraphvizMarkedNodeAttr)) {
+ graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
+ }
+ return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
+}
+
} // namespace ir
} // namespace framework
} // namespace paddle
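
The helper above lazily creates the marked-nodes attribute, so any pass can highlight nodes in the next graph_viz_pass dump. A minimal sketch (the wrapper function is illustrative):

    // Highlight a node in subsequent graph_viz_pass output.
    void MarkForVisualization(Graph* graph, Node* node) {
      GetMarkedNodes(graph).insert(node);  // attribute is created on first use
    }
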
diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
index f885567da1965b997b2063e06c839af95b43e1e1..7713ed1eab88ee4fa16d52e7425075ae66f721a3 100644
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -13,42 +13,41 @@
// limitations under the License.
#include
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
-class InferCleanGraphPass : public Pass {
+class InferCleanGraphPass : public FusePassBase {
public:
virtual ~InferCleanGraphPass() {}
protected:
std::unique_ptr ApplyImpl(std::unique_ptr graph) const {
+ FusePassBase::Init("original_graph", graph.get());
PADDLE_ENFORCE(graph.get());
auto is_valid_node = [](Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
};
- std::unordered_set<Node *> invalid_nodes;
+ std::unordered_set<const Node *> invalid_nodes;
+ int valid_op = 0;
for (auto* node : graph->Nodes()) {
if (is_valid_node(node)) {
invalid_nodes.insert(node);
+ } else if (node->IsOp()) {
+ // Count the operators to help track the number of operators.
+ ++valid_op;
}
}
- // remove nodes from the graph.
- for (auto* node : invalid_nodes) {
- graph->RemoveNode(node);
- }
+ GraphSafeRemoveNodes(graph.get(), invalid_nodes);
- // clean edges.
- for (auto* node : graph->Nodes()) {
- CleanEdges(&node->inputs, invalid_nodes);
- CleanEdges(&node->outputs, invalid_nodes);
- }
+ AddStatis(valid_op);
return graph;
}
diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
index a776a898a5ee13b4dde12460dce71433268fb9d4..e1a441d09aaa3647c4b2a582210a2c7e2b64e0da 100644
--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -219,16 +219,13 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl(
op_desc.SetAttr("fc_activation", act->Op()->Type());
auto* op_node = graph->CreateOpNode(&op_desc);
-// Add links
-#define NODE_LINKS(a, b) \
- a->outputs.push_back(b); \
- b->inputs.push_back(a);
- NODE_LINKS(fc_w, op_node);
- NODE_LINKS(fc_bias, op_node);
- NODE_LINKS(concat_in0, op_node);
- NODE_LINKS(sequence_expand0_in, op_node);
- NODE_LINKS(sequence_expand1_in, op_node);
- NODE_LINKS(op_node, fc_out);
+ // Add links
+ IR_NODE_LINK_TO(fc_w, op_node);
+ IR_NODE_LINK_TO(fc_bias, op_node);
+ IR_NODE_LINK_TO(concat_in0, op_node);
+ IR_NODE_LINK_TO(sequence_expand0_in, op_node);
+ IR_NODE_LINK_TO(sequence_expand1_in, op_node);
+ IR_NODE_LINK_TO(op_node, fc_out);
// Clean nodes.
std::unordered_set<const Node *> marked_nodes;
@@ -241,7 +238,6 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl(
marked_nodes.erase(sequence_expand0_in);
marked_nodes.erase(sequence_expand1_in);
marked_nodes.erase(fc_out);
-
GraphSafeRemoveNodes(graph, marked_nodes);
});
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index a4f6364ae5b7d832096c92e9c6d8b3e865713cff..2006e3b24f71d0ae32b4e2ae34f1a1e4d3a82f91 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -10,19 +10,19 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
cc_library(paddle_fluid_api
SRCS io.cc
- DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
+ DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
# paddle_fluid_origin exclude inference api interface
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
-if(NOT APPLE)
+#if(APPLE)
add_subdirectory(api)
-endif()
+#endif()
# Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
if(NOT APPLE)
# TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -32,6 +32,7 @@ endif()
# Create shared library
cc_library(paddle_fluid_shared SHARED
SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
DEPS ${fluid_modules} paddle_fluid_api)
set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index d43ecc722ea3c78541835fb3f5efc9a3529fbf11..f2e18a461fd221252e4a10262a13bc8e942f5988 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -25,17 +25,16 @@ function (inference_analysis_test TARGET)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS EXTRA_DEPS)
+ set(multiValueArgs SRCS ARGS EXTRA_DEPS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
set(mem_opt "")
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
- DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
- ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
+ DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+ ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
@@ -51,32 +50,19 @@ endfunction(inference_download_and_uncompress)
set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
-if (NOT EXISTS ${DITU_INSTALL_DIR})
+if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
endif()
inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
- EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
- analysis_predictor
- # ir
- fc_fuse_pass
- fc_lstm_fuse_pass
- seq_concat_fc_fuse_pass
- graph_viz_pass
- infer_clean_graph_pass
- graph_pattern_detector
- infer_clean_graph_pass
- attention_lstm_fuse_pass
- paddle_inference_api
- pass
- ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
- --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
- --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
+ EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+ ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
+ --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)
inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
-inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
-inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
@@ -88,13 +74,37 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
-if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR})
+if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
endif()
-inference_analysis_test(test_chinese_ner SRCS chinese_ner_tester.cc
+inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
EXTRA_DEPS paddle_inference_api paddle_fluid_api
- ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
- --infer_model=${CHINESE_NER_INSTALL_DIR}/model
+ ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
--infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+
+set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
+set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
+set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
+if (NOT EXISTS ${LAC_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+ inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
+ inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+endif()
+
+inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
+ EXTRA_DEPS paddle_inference_api paddle_fluid_api
+ ARGS --infer_model=${LAC_INSTALL_DIR}/model
+ --infer_data=${LAC_INSTALL_DIR}/data.txt)
+
+
+set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
+
+if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+ inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
+endif()
+
+inference_analysis_test(test_text_classification SRCS test_text_classification.cc
+ EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+ ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index e6e63544ffa2de09e39b02769aaaf0793d6b1111..1fd884435d173800563ea37809003ed3aee16c7c 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -14,6 +14,7 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include
+#include
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
// TODO(Superjomn) set the key with pass reprs.
- LOG(INFO)
- << "-----------------------------------------------------------------";
- if (FLAGS_IA_enable_ir) {
- AddPass("fluid-to-ir-pass", new FluidToIrPass);
- } else {
+ if (!FLAGS_IA_enable_ir) {
AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+ } else {
+ AddPass("fluid-to-ir-pass", new FluidToIrPass);
}
TryAddTensorRtPass();
AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
if (!FLAGS_IA_output_storage_path.empty()) {
AddPass("model-store-pass", new ModelStorePass);
}
- LOG(INFO)
- << "-----------------------------------------------------------------";
}
std::string repr() const override { return "dfg-pass-manager"; }
@@ -101,18 +98,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
void Analyzer::Run(Argument* argument) {
- // Ugly support fluid-to-ir-pass
- argument->Set(kFluidToIrPassesAttr,
- new std::vector<std::string>({
- // Manual update the passes here.
- "graph_viz_pass", //
- "infer_clean_graph_pass", "graph_viz_pass", //
- "attention_lstm_fuse_pass", "graph_viz_pass", //
- "fc_lstm_fuse_pass", "graph_viz_pass", //
- "seq_concat_fc_fuse_pass", "graph_viz_pass", //
- "fc_fuse_pass", "graph_viz_pass" //
-
- }));
+ std::vector<std::string> passes;
+ for (auto& pass : all_ir_passes_) {
+ if (!disabled_ir_passes_.count(pass)) {
+ passes.push_back(pass);
+ passes.push_back("graph_viz_pass"); // add graphviz for debug.
+ }
+ }
+ passes.push_back("graph_viz_pass");
+ argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));
for (auto& x : data_) {
PADDLE_ENFORCE(x->Initialize(argument));
@@ -121,6 +115,11 @@ void Analyzer::Run(Argument* argument) {
}
}
+Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
+ disabled_ir_passes_.insert(passes.begin(), passes.end());
+ return *this;
+}
+
} // namespace analysis
} // namespace inference
} // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 2e107c82dd50d5cf22797f4c82e69d302514f955..3fdd2b9ec7537c891a04efb3ca9a1d45075ffa5e 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,16 +36,10 @@ limitations under the License. */
*/
#include
+#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/pass.h"
#include "paddle/fluid/inference/analysis/pass_manager.h"
-// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
-// flag if not available.
-DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
-DECLARE_string(IA_graphviz_log_root);
-DECLARE_string(IA_output_storage_path);
-DECLARE_bool(IA_enable_ir);
-
namespace paddle {
namespace inference {
namespace analysis {
@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry {
void Run(Argument* argument);
+ Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+
DISABLE_COPY_AND_ASSIGN(Analyzer);
+
+ private:
+ // All available IR passes.
+ // The larger fusions come first, so that smaller operators are preferentially
+ // merged into larger fused ops; the smaller fusions applied later cannot break
+ // the patterns of the larger ones.
+ const std::vector<std::string> all_ir_passes_{{
+ // Manual update the passes here.
+ "infer_clean_graph_pass", //
+ "attention_lstm_fuse_pass", //
+ "fc_lstm_fuse_pass", //
+ "mul_lstm_fuse_pass", //
+ "seq_concat_fc_fuse_pass", //
+ "fc_fuse_pass", //
+ }};
+
+ std::unordered_set<std::string> disabled_ir_passes_;
};
} // namespace analysis
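
The new exclude-style configuration is chained in front of Run; a minimal sketch, assuming the Argument has already been populated by the caller (the disabled pass is only an example):

    Argument argument;  // assumed to be set up elsewhere
    Analyzer()
        .DisableIrPasses({"attention_lstm_fuse_pass"})  // skip just this fuse
        .Run(&argument);  // every other pass in all_ir_passes_ still runs
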
diff --git a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2f7253ac04cac8457fa60a055e4ef2770aa874b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include
+#include
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path for LAC");
+DEFINE_string(infer_data, "", "data file for LAC");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+struct DataRecord {
+ std::vector<int64_t> data;
+ std::vector<size_t> lod;
+ // for dataset and nextbatch
+ size_t batch_iter{0};
+ std::vector<std::vector<size_t>> batched_lods;
+ std::vector<std::vector<int64_t>> batched_datas;
+ std::vector<std::vector<int64_t>> datasets;
+ DataRecord() = default;
+ explicit DataRecord(const std::string &path, int batch_size = 1) {
+ Load(path);
+ Prepare(batch_size);
+ batch_iter = 0;
+ }
+ void Load(const std::string &path) {
+ std::ifstream file(path);
+ std::string line;
+ int num_lines = 0;
+ datasets.resize(0);
+ while (std::getline(file, line)) {
+ num_lines++;
+ std::vector<std::string> data;
+ split(line, ';', &data);
+ std::vector<int64_t> words_ids;
+ split_to_int64(data[1], ' ', &words_ids);
+ datasets.emplace_back(words_ids);
+ }
+ }
+ void Prepare(int bs) {
+ if (bs == 1) {
+ batched_datas = datasets;
+ for (auto one_sentence : datasets) {
+ batched_lods.push_back({0, one_sentence.size()});
+ }
+ } else {
+ std::vector<int64_t> one_batch;
+ std::vector<size_t> lod{0};
+ int bs_id = 0;
+ for (auto one_sentence : datasets) {
+ bs_id++;
+ one_batch.insert(one_batch.end(), one_sentence.begin(),
+ one_sentence.end());
+ lod.push_back(lod.back() + one_sentence.size());
+ if (bs_id == bs) {
+ bs_id = 0;
+ batched_datas.push_back(one_batch);
+ batched_lods.push_back(lod);
+ one_batch.clear();
+ one_batch.resize(0);
+ lod.clear();
+ lod.resize(0);
+ lod.push_back(0);
+ }
+ }
+ if (one_batch.size() != 0) {
+ batched_datas.push_back(one_batch);
+ batched_lods.push_back(lod);
+ }
+ }
+ }
+ DataRecord NextBatch() {
+ DataRecord data;
+ data.data = batched_datas[batch_iter];
+ data.lod = batched_lods[batch_iter];
+ batch_iter++;
+ if (batch_iter >= batched_datas.size()) {
+ batch_iter = 0;
+ }
+ return data;
+ }
+};
+void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+ int batch_size) {
+ auto one_batch = data->NextBatch();
+ PaddleTensor input_tensor;
+ input_tensor.name = "word";
+ input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
+ input_tensor.lod.assign({one_batch.lod});
+ input_tensor.dtype = PaddleDType::INT64;
+ TensorAssignData(&input_tensor, {one_batch.data});
+ PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
+ input_slots->assign({input_tensor});
+}
+static void PrintTime(const double latency, const int bs, const int repeat) {
+ LOG(INFO) << "===========profile result===========";
+ LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
+ << ", avg latency: " << latency / repeat << "ms";
+ LOG(INFO) << "=====================================";
+}
+void BenchAllData(const std::string &model_path, const std::string &data_file,
+ const int batch_size, const int repeat) {
+ NativeConfig config;
+ config.model_dir = model_path;
+ config.use_gpu = false;
+ config.device = 0;
+ config.specify_input_name = true;
+ std::vector<PaddleTensor> input_slots, outputs_slots;
+ DataRecord data(data_file, batch_size);
+ auto predictor =
+ CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+ GetOneBatch(&input_slots, &data, batch_size);
+ for (int i = 0; i < FLAGS_burning; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ Timer timer;
+ double sum = 0;
+ for (int i = 0; i < repeat; i++) {
+ for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
+ GetOneBatch(&input_slots, &data, batch_size);
+ timer.tic();
+ predictor->Run(input_slots, &outputs_slots);
+ sum += timer.toc();
+ }
+ }
+ PrintTime(sum, batch_size, repeat);
+}
+const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
+ 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
+ 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
+ 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+void TestLACPrediction(const std::string &model_path,
+ const std::string &data_file, const int batch_size,
+ const int repeat, bool test_all_data) {
+ if (test_all_data) {
+ BenchAllData(model_path, data_file, batch_size, repeat);
+ return;
+ }
+ NativeConfig config;
+ config.model_dir = model_path;
+ config.use_gpu = false;
+ config.device = 0;
+ config.specify_input_name = true;
+ std::vector<PaddleTensor> input_slots, outputs_slots;
+ DataRecord data(data_file, batch_size);
+ GetOneBatch(&input_slots, &data, batch_size);
+ auto predictor =
+ CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+ for (int i = 0; i < FLAGS_burning; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ Timer timer;
+ timer.tic();
+ for (int i = 0; i < repeat; i++) {
+ predictor->Run(input_slots, &outputs_slots);
+ }
+ PrintTime(timer.toc(), batch_size, repeat);
+ EXPECT_EQ(outputs_slots.size(), 1UL);
+ auto &out = outputs_slots[0];
+ size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+ [](int a, int b) { return a * b; });
+ size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+ PADDLE_ENFORCE_GT(size, 0);
+ EXPECT_GE(size, batch1_size);
+ int64_t *pdata = static_cast<int64_t *>(out.data.data());
+ for (size_t i = 0; i < batch1_size; ++i) {
+ EXPECT_EQ(pdata[i], lac_ref_data[i]);
+ }
+}
+TEST(Analyzer_LAC, native) {
+ LOG(INFO) << "LAC with native";
+ TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+ FLAGS_repeat, FLAGS_test_all_data);
+}
+} // namespace analysis
+} // namespace inference
+} // namespace paddle
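
A short worked example of the batching layout that DataRecord::Prepare builds and GetOneBatch feeds to the "word" tensor (the numbers are illustrative): with batch_size = 2 and two sentences of lengths 3 and 2, the word ids are concatenated and the LoD keeps cumulative offsets.

    // data  = {11, 12, 13, 21, 22};  // flattened word ids of both sentences
    // lod   = {0, 3, 5};             // sentence i spans [lod[i], lod[i+1])
    // shape = {5, 1}, dtype = INT64  // what GetOneBatch assigns above
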
diff --git a/paddle/fluid/inference/analysis/chinese_ner_tester.cc b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
similarity index 100%
rename from paddle/fluid/inference/analysis/chinese_ner_tester.cc
rename to paddle/fluid/inference/analysis/analyzer_ner_tester.cc
index 9088a29d504309bc2c7b96fd49a0bf44e7cf0da9..720a8811db75a91a5774a29dd95285eceabadf83 100644
--- a/paddle/fluid/inference/analysis/chinese_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_ner_tester.cc
@@ -12,10 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include "paddle/fluid/inference/analysis/analyzer.h"
#include
#include
#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
index 1a65e85dd237eb1bacd3c15b4538a9835ec4b9e0..4cf26d3c70eafd951d14c26335416ec2c71c001d 100644
--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,25 +16,27 @@
#include
#include
+#include // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
DEFINE_int32(batch_size, 10, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Run the inference program with multiple threads.");
namespace paddle {
namespace inference {
namespace analysis {
-using namespace framework;
+using namespace framework; // NOLINT
TEST(Analyzer, analysis_without_tensorrt) {
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
@@ -219,39 +221,6 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data,
}
}
-std::string DescribeTensor(const PaddleTensor &tensor) {
- std::stringstream os;
- os << "Tensor [" << tensor.name << "]\n";
- os << " - type: ";
- switch (tensor.dtype) {
- case PaddleDType::FLOAT32:
- os << "float32";
- break;
- case PaddleDType::INT64:
- os << "int64";
- break;
- default:
- os << "unset";
- }
- os << '\n';
-
- os << " - shape: " << to_string(tensor.shape) << '\n';
- os << " - lod: ";
- for (auto &l : tensor.lod) {
- os << to_string(l) << "; ";
- }
- os << "\n";
- os << " - data: ";
-
- int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
- [](int a, int b) { return a * b; });
- for (int i = 0; i < dim; i++) {
- os << static_cast(tensor.data.data())[i] << " ";
- }
- os << '\n';
- return os.str();
-}
-
} // namespace
const float ditu_rnn_target_data[] = {
@@ -265,57 +234,97 @@ const float ditu_rnn_target_data[] = {
10.7286, 12.0595, 10.6672, 0, 0, 0, 0, 0,
93.5771, 3.84641, 0, 0, 0, 0, 0, 0,
169.426, 0, 0, 0, 0, 0, 0, 0};
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+ const std::vector<PaddleTensor> &base_outputs) {
+ PADDLE_ENFORCE_GT(outputs.size(), 0);
+ PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+ for (size_t i = 0; i < outputs.size(); i++) {
+ auto &out = outputs[i];
+ auto &base_out = base_outputs[i];
+ size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+ [](int a, int b) { return a * b; });
+ size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
+ 1, [](int a, int b) { return a * b; });
+ PADDLE_ENFORCE_EQ(size, size1);
+ PADDLE_ENFORCE_GT(size, 0);
+ float *data = static_cast<float *>(out.data.data());
+ float *base_data = static_cast<float *>(base_out.data.data());
+ for (size_t i = 0; i < size; i++) {
+ EXPECT_NEAR(data[i], base_data[i], 1e-3);
+ }
+ }
+}
// Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
- const std::string &data_path, int batch_size,
- bool use_analysis, bool activate_ir,
- int num_times = 1) {
- NativeConfig config;
+void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
+ int num_threads) {
+ AnalysisConfig config;
config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
+ config.enable_ir_optim = activate_ir;
+ PADDLE_ENFORCE(config.ir_mode ==
+ AnalysisConfig::IrPassMode::kExclude); // default
+ config.ir_passes.clear(); // Do not exclude any pass.
+ int batch_size = FLAGS_batch_size;
+ int num_times = FLAGS_repeat;
auto base_predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
auto predictor =
- CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
+ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+ config);
std::vector<PaddleTensor> input_slots;
- DataRecord data(data_path, batch_size);
+ DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
// Prepare inputs.
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
base_predictor->Run(input_slots, &base_outputs);
- Timer timer;
- timer.tic();
- for (int i = 0; i < num_times; i++) {
- predictor->Run(input_slots, &outputs);
- }
LOG(INFO) << "===========profile result===========";
- LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
- << ", latency: " << timer.toc() / num_times << "ms";
- LOG(INFO) << "=====================================";
-
- PADDLE_ENFORCE_GT(outputs.size(), 0);
- PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
- for (size_t i = 0; i < outputs.size(); i++) {
- auto &out = outputs[i];
- auto &base_out = base_outputs[i];
- size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
- [](int a, int b) { return a * b; });
- size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
- 1, [](int a, int b) { return a * b; });
- PADDLE_ENFORCE_EQ(size, size1);
- PADDLE_ENFORCE_GT(size, 0);
- float *data = static_cast(out.data.data());
- float *base_data = static_cast(base_out.data.data());
- for (size_t i = 0; i < size; i++) {
- EXPECT_NEAR(data[i], base_data[i], 1e-3);
+ if (num_threads == 1) {
+ // Prepare inputs.
+ Timer timer;
+ timer.tic();
+ for (int i = 0; i < num_times; i++) {
+ predictor->Run(input_slots, &outputs);
+ }
+ PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+ CompareResult(outputs, base_outputs);
+ } else {
+ std::vector<std::thread> threads;
+ std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+ // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+ // because AttentionLSTM's hard-coded node id will be damaged.
+ for (int tid = 0; tid < num_threads; ++tid) {
+ predictors.emplace_back(
+ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+ config));
+ }
+ for (int tid = 0; tid < num_threads; ++tid) {
+ threads.emplace_back([&, tid]() {
+ // Each thread should have local input_slots and outputs.
+ std::vector<PaddleTensor> input_slots;
+ DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+ PrepareInputs(&input_slots, &data, batch_size);
+ std::vector<PaddleTensor> outputs;
+ Timer timer;
+ timer.tic();
+ for (int i = 0; i < num_times; i++) {
+ predictors[tid]->Run(input_slots, &outputs);
+ }
+ PrintTime(batch_size, num_times, num_threads, tid,
+ timer.toc() / num_times);
+ CompareResult(outputs, base_outputs);
+ });
+ }
+ for (int i = 0; i < num_threads; ++i) {
+ threads[i].join();
}
}
+ LOG(INFO) << "=====================================";
if (use_analysis && activate_ir) {
AnalysisPredictor *analysis_predictor =
@@ -327,39 +336,45 @@ void TestDituRNNPrediction(const std::string &model_path,
LOG(INFO) << "fused " << item.first << " " << item.second;
}
- ASSERT_TRUE(fuse_statis.count("fc"));
- EXPECT_EQ(fuse_statis.at("fc"), 1);
- }
-}
+ int num_ops = 0;
+ for (auto &node :
+ analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+ if (node->IsFunction()) {
+ ++num_ops;
+ }
+ }
+ LOG(INFO) << "has num ops: " << num_ops;
-// Directly infer with the original model.
-TEST(Analyzer, DituRNN_without_analysis) {
- TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
- FLAGS_batch_size, false, false, FLAGS_repeat);
+ ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+ EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+ EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
+ EXPECT_EQ(num_ops,
+ 13); // After graph optimization, only 13 operators exist.
+ }
}
-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
- LOG(INFO) << "ditu rnn with analysis";
- TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
- FLAGS_batch_size, true, false, FLAGS_repeat);
+// Inference with analysis and IR, easy for profiling independently.
+TEST(Analyzer, DituRNN) {
+ TestDituRNNPrediction(true, true, FLAGS_num_threads);
}
-// Inference with analysis and IR. The IR module will fuse some large kernels.
-TEST(Analyzer, DituRNN_with_analysis_with_IR) {
- LOG(INFO) << "ditu rnn with analysis and IR fuse";
- TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
- FLAGS_batch_size, true, true, FLAGS_repeat);
+// Other DituRNN unit tests that exercise different combinations of use_analysis,
+// activate_ir and multi-threading.
+TEST(Analyzer, DituRNN_tests) {
+ int num_threads[2] = {1, 4};
+ for (auto i : num_threads) {
+ // Directly infer with the original model.
+ TestDituRNNPrediction(false, false, i);
+ // Inference with the original model with analysis turned on; the analysis
+ // module will transform the program into a data flow graph.
+ TestDituRNNPrediction(true, false, i);
+ // Inference with analysis and IR. The IR module will fuse some large
+ // kernels.
+ TestDituRNNPrediction(true, true, i);
+ }
}
} // namespace analysis
} // namespace inference
} // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 3a4ffe967e67ab0487192bbf12d4d5a15f536aa3..e8fb0775b45761f64fd6fd28306c35b76d1e40c4 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -67,7 +67,7 @@ struct Argument {
PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
key);
attrs_[key] = data;
- attr_deleters_[key] = [data, key, this]() {
+ attr_deleters_[key] = [data, key]() {
VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
VLOG(3) << "argument delete attr: " << key;
delete data;
diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..717e543f01dfa071865a5c14c0b7679e65239daf
--- /dev/null
+++ b/paddle/fluid/inference/analysis/flags.h
@@ -0,0 +1,22 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
index 6731b1f759363eec5dd8645783212a72ace67b2f..3086085710d6e850ed27e82d2323690dfdd3ef19 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -15,6 +15,7 @@
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/flags.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/pass.h"
@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass {
new Scope *(&argument_->Get(ir::kParamScopeAttr)));
}
- const auto &ir_passes_to_apply =
- argument_->Get>(kFluidToIrPassesAttr);
- ir_passes.Apply(ir_passes_to_apply);
+ if (FLAGS_IA_enable_ir) {
+ const auto &ir_passes_to_apply =
+ argument_->Get>(kFluidToIrPassesAttr);
+ ir_passes.Apply(ir_passes_to_apply);
+ }
PADDLE_ENFORCE(argument_->main_dfg.get());
argument_->main_dfg->Build(ir_passes.graph());
diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
index 6a13c60e7b2ebf645b12d5ddf83ef6ab3a2e83bd..367c25805d05f8d10fb8341158760ac6356a5c48 100644
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
@@ -16,6 +16,7 @@
#include
#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
namespace paddle {
namespace inference {
@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
} // namespace analysis
} // namespace inference
} // namespace paddle
-
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_fuse_pass);
diff --git a/paddle/fluid/inference/analysis/test_text_classification.cc b/paddle/fluid/inference/analysis/test_text_classification.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2913824f62301795aea967c22021b2af11f343c1
--- /dev/null
+++ b/paddle/fluid/inference/analysis/test_text_classification.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
+
+DEFINE_string(infer_model, "", "Directory of the inference model.");
+DEFINE_string(infer_data, "", "Path of the dataset.");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "How many times to repeat run.");
+
+namespace paddle {
+
+template <typename T>
+std::string to_string(const std::vector<T> &vec) {
+ std::stringstream ss;
+ for (const auto &c : vec) {
+ ss << c << " ";
+ }
+ return ss.str();
+}
+
+void PrintTime(const double latency, const int bs, const int repeat) {
+ LOG(INFO) << "===========profile result===========";
+ LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
+ << ", avg latency: " << latency / repeat << "ms";
+ LOG(INFO) << "=====================================";
+}
+
+void Main(int batch_size) {
+ // Three sequence inputs.
+ std::vector<PaddleTensor> input_slots(1);
+ // one batch starts
+ // data --
+ int64_t data0[] = {0, 1, 2};
+ for (auto &input : input_slots) {
+ input.data.Reset(data0, sizeof(data0));
+ input.shape = std::vector<int>({3, 1});
+ // dtype --
+ input.dtype = PaddleDType::INT64;
+ // LoD --
+ input.lod = std::vector<std::vector<size_t>>({{0, 3}});
+ }
+
+ // shape --
+ // Create Predictor --
+ AnalysisConfig config;
+ config.model_dir = FLAGS_infer_model;
+ config.use_gpu = false;
+ config.enable_ir_optim = true;
+ config.ir_passes.push_back("fc_lstm_fuse_pass");
+ auto predictor =
+ CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+ config);
+
+ inference::Timer timer;
+ double sum = 0;
+ std::vector<PaddleTensor> output_slots;
+ for (int i = 0; i < FLAGS_repeat; i++) {
+ timer.tic();
+ CHECK(predictor->Run(input_slots, &output_slots));
+ sum += timer.toc();
+ }
+ PrintTime(sum, batch_size, FLAGS_repeat);
+
+ // Get output
+ LOG(INFO) << "get outputs " << output_slots.size();
+
+ for (auto &output : output_slots) {
+ LOG(INFO) << "output.shape: " << to_string(output.shape);
+ // no lod ?
+ CHECK_EQ(output.lod.size(), 0UL);
+ LOG(INFO) << "output.dtype: " << output.dtype;
+ std::stringstream ss;
+ for (int i = 0; i < 5; i++) {
+ ss << static_cast<float *>(output.data.data())[i] << " ";
+ }
+ LOG(INFO) << "output.data summary: " << ss.str();
+ // one batch ends
+ }
+}
+
+TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+
+} // namespace paddle
+
+USE_PASS(fc_fuse_pass);
+USE_PASS(seq_concat_fc_fuse_pass);
+USE_PASS(fc_lstm_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
+USE_PASS(attention_lstm_fuse_pass);
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index adfe4392448557a30cd834022b9a5d21d9086b95..ea00bf364951b0a4304b380df492d00e84451136 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,10 +18,7 @@ if(APPLE)
endif(APPLE)
-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
- graph_viz_pass fc_fuse_pass
- infer_clean_graph_pass
- )
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME)
endfunction(inference_api_test)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
+ analysis
+ ir_pass_manager
+ pass
+ fc_fuse_pass
+ fc_lstm_fuse_pass
+ seq_concat_fc_fuse_pass
+ graph_viz_pass
+ infer_clean_graph_pass
+ graph_pattern_detector
+ infer_clean_graph_pass
+ attention_lstm_fuse_pass
+ )
cc_test(test_paddle_inference_api
SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 33862232bdaae817b9ca72879605386c32ed3e8b..79eeea88ea83ad862b5e2ac1390dae377b676685 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,10 +14,13 @@
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include
+#include
+#include
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
@@ -27,10 +30,11 @@ bool AnalysisPredictor::Init(
VLOG(3) << "Predictor::init()";
if (config_.use_gpu) {
place_ = paddle::platform::CUDAPlace(config_.device);
+ LOG(WARNING) << "ir optimize only supports CPU currently";
+ config_.enable_ir_optim = false;
} else {
place_ = paddle::platform::CPUPlace();
}
- PADDLE_ENFORCE(!parent_scope);
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
@@ -72,7 +76,7 @@ bool AnalysisPredictor::Init(
void AnalysisPredictor::OptimizeInferenceProgram() {
LOG(INFO) << "optimize begin";
- FLAGS_IA_enable_ir = true;
+ FLAGS_IA_enable_ir = config_.enable_ir_optim;
FLAGS_IA_enable_tensorrt_subgraph_engine = false;
FLAGS_IA_output_storage_path = ""; // Don't output the model.
// Analyze inference_program
@@ -89,24 +93,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
argument_.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
- Analyzer().Run(&argument_);
+ PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
+ "Only kExclude is supported yet.");
+ Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+
CHECK(argument_.transformed_program_desc);
VLOG(5) << "to prepare executor";
- // LOG(INFO) << "transformed_parogram_desc " <<
- // argument.transformed_program_desc->DebugString();
inference_program_.reset(
new framework::ProgramDesc(*argument_.transformed_program_desc));
- PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
- // Update scope.
- scope_.reset(
- argument_.Release(framework::ir::kParamScopeAttr));
- LOG(INFO) << "optimize end ==";
+ if (argument_.Has(framework::ir::kParamScopeAttr)) {
+ // Update scope.
+ scope_.reset(
+ argument_.Release(framework::ir::kParamScopeAttr));
+ }
+ LOG(INFO) << "== optimize end ==";
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
- NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
- VLOG(3) << "create NativePredictor";
+ AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
+ VLOG(3) << "create AnalysisConfig";
if (config.use_gpu) {
// 1. GPU memeroy
PADDLE_ENFORCE_GT(
@@ -133,7 +139,3 @@ std::unique_ptr CreatePaddlePredictor<
}
} // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e32b6185f6044ab3577bde0a8f8dcf2391688aa8..e53925366e9214cd60422efe56884751297c15e5 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include
+#include
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
*/
class AnalysisPredictor : public NativePaddlePredictor {
public:
- explicit AnalysisPredictor(const NativeConfig& config)
+ explicit AnalysisPredictor(const AnalysisConfig& config)
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
Argument& analysis_argument() { return argument_; }
private:
- NativeConfig config_;
+ AnalysisConfig config_;
Argument argument_;
};
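
For reference, a minimal sketch of driving the new predictor through AnalysisConfig; the model path is a placeholder and only fields exercised in this patch are shown:

    AnalysisConfig config;
    config.model_dir = "/path/to/model";  // or prog_file + param_file
    config.use_gpu = false;               // IR optimization is CPU-only for now
    config.enable_ir_optim = true;
    // Under the default kExclude mode, passes listed here are skipped.
    config.ir_passes.push_back("attention_lstm_fuse_pass");
    auto predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(config);
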
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 530274f0c9262b6ed0e43766606585c8459eabb9..bd9b4b1a814f995e3979105f5b9830b95fd8ea7d 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include
#include