Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_python35_CI_random_fail

3420c949 · minqiyang · d4accfa9 · b8057515 · 3420c949 · 3420c949
239 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ RUN curl -s -q https://glide.sh/get | sh
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
    tar -xz -C /usr/local && \
    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr

--- a/README.md
+++ b/README.md
@@ -76,33 +76,26 @@ pip install paddlepaddle-gpu==0.14.0.post85

 ## Installation

-It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
-before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
+It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/install/install_doc.html) on our website.

 ## Documentation

-We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
+We provide [English](http://paddlepaddle.org/documentation/docs/en/0.14.0/getstarted/index_en.html) and
+[Chinese](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/beginners_guide/index.html) documentation.

- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
+- [Deep Learning 101](https://github.com/PaddlePaddle/book)

  You might want to start from this online interactive book that can run in a Jupyter Notebook.

- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
+- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/user_guides/howto/training/cluster_howto.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
-
-   You can also run distributed training jobs on Kubernetes clusters.
-
- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
+- [Python API](http://paddlepaddle.org/documentation/api/zh/0.14.0/fluid.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/0.14.0/new_docs/advanced_usage/development/contribute_to_paddle.html)

   We appreciate your contributions!


--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -169,14 +169,19 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)

 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
+if (NOT WIN32) # Windows MSVC 2015 supports C++11 natively.
+# -std=c++11 and -fPIC are not recognized by MSVC; -Xcompiler will be added by cmake.
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+endif(NOT WIN32)
+
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 # in cuda9, suppress cuda warning on eigen 
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")

+if (NOT WIN32)
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
 elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
@@ -187,6 +192,13 @@ elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
    # nvcc 9 does not support -Os. Use Release flags instead
    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
 endif()
+else(NOT WIN32)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+  # Windows branch: add the release optimization flags to nvcc explicitly.
+  list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG")
+else()
+  # FATAL_ERROR is the message() mode that stops configuration. A bare "FATAL"
+  # is not a mode keyword, so it was printed as part of the text and the
+  # configure step continued with an unsupported build type.
+  message(FATAL_ERROR "Windows only support Release build now. Please set visual studio build type to Release, x64 build.")
+endif()
+endif(NOT WIN32)

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
 mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
    #    checkout and clean other dirs under third_party
    # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -128,16 +128,13 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 if (NOT WIN32)
-copy(framework_lib DEPS framework_py_proto 
-  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
-)
-else()
-copy(framework_lib
+set(framework_lib_deps framework_py_proto)
+endif(NOT WIN32)
+copy(framework_lib DEPS ${framework_lib_deps}
  SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
+       ${src_dir}/${module}/ir/*.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module}/ir
 )
-endif(NOT WIN32)

 set(module "memory")
 copy(memory_lib
@@ -161,7 +158,8 @@ set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )

 set(module "platform")

--- a/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_anakin.rst
-服务器端部署 - Anakin
-#####################
+Anakin - 服务器端加速引擎
+#######################


 使用文档

--- a/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/index_native.rst
-服务器端部署 - 原生引擎
-#######################
-
-..  toctree::
-    :maxdepth: 2
-
-    build_and_install_lib_cn.rst
-    native_infer.rst
--- a/doc/fluid/new_docs/advanced_usage/index.rst
+++ b/doc/fluid/new_docs/advanced_usage/index.rst
@@ -10,7 +10,6 @@
 ..  toctree::
    :maxdepth: 2

-    deploy/index_native.rst
    deploy/index_anakin.rst
    deploy/index_mobile.rst
    development/contribute_to_paddle.md

--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/.gitignore
+*.pyc
+train.log
+output
+data/cifar-10-batches-py/
+data/cifar-10-python.tar.gz
+data/*.txt
+data/*.list
+data/mean.meta
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/README.cn.md
@@ -21,7 +21,7 @@
 图像分类包括通用图像分类、细粒度图像分类等。图1展示了通用图像分类效果，即模型可以正确识别图像上的主要物体。

 <p align="center">
-<img src="image/dog_cat.png "  width="350" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/dog_cat.png?raw=true"  width="350" ><br/>
 图1. 通用图像分类展示
 </p>

@@ -30,7 +30,7 @@


 <p align="center">
-<img src="image/flowers.png" width="400" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/flowers.png?raw=true" width="400" ><br/>
 图2. 细粒度图像分类展示
 </p>

@@ -38,7 +38,7 @@
 一个好的模型既要对不同类别识别正确，同时也应该能够对不同视角、光照、背景、变形或部分遮挡的图像正确识别(这里我们统一称作图像扰动)。图3展示了一些图像的扰动，较好的模型会像聪明的人类一样能够正确识别。

 <p align="center">
-<img src="image/variations.png" width="550" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/variations.png?raw=true" width="550" ><br/>
 图3. 扰动图片展示[22]
 </p>

@@ -61,7 +61,7 @@
 Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得了历史性的突破，效果大幅度超越传统方法，获得了ILSVRC2012冠军，该模型被称作AlexNet。这也是首次将深度学习用于大规模图像分类中。从AlexNet之后，涌现了一系列CNN模型，不断地在ImageNet上刷新成绩，如图4展示。随着模型变得越来越深以及精妙的结构设计，Top-5的错误率也越来越低，降到了3.5%附近。而在同样的ImageNet数据集上，人眼的辨识错误率大概在5.1%，也就是目前的深度学习模型的识别能力已经超过了人眼。

 <p align="center">
-<img src="image/ilsvrc.png" width="500" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/ilsvrc.png?raw=true" width="500" ><br/>
 图4. ILSVRC图像分类Top-5错误率
 </p>

@@ -70,7 +70,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
 传统CNN包含卷积层、全连接层等组件，并采用softmax多类别分类器和多类交叉熵损失函数，一个典型的卷积神经网络如图5所示，我们先介绍用来构造CNN的常见组件。

 <p align="center">
-<img src="image/lenet.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/lenet.png?raw=true"><br/>
 图5. CNN网络示例[20]
 </p>

@@ -89,7 +89,7 @@ Alex Krizhevsky在2012年ILSVRC提出的CNN模型 \[[9](#参考文献)\] 取得
 牛津大学VGG(Visual Geometry Group)组在2014年ILSVRC提出的模型被称作VGG模型 \[[11](#参考文献)\] 。该模型相比以往模型进一步加宽和加深了网络结构，它的核心是五组卷积操作，每两组之间做Max-Pooling空间降维。同一组内采用多次连续的3X3卷积，卷积核的数目由较浅组的64增多到最深组的512，同一组内的卷积核数目是一样的。卷积之后接两层全连接层，之后是分类层。由于每组内卷积层的不同，有11、13、16、19层这几种模型，下图展示一个16层的网络结构。VGG模型结构相对简洁，提出之后也有很多文章基于此模型进行研究，如在ImageNet上首次公开超过人眼识别的模型\[[19](#参考文献)\]就是借鉴VGG模型的结构。

 <p align="center">
-<img src="image/vgg16.png" width="750" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/vgg16.png?raw=true" width="750" ><br/>
 图6. 基于ImageNet的VGG16模型
 </p>

@@ -106,7 +106,7 @@ NIN模型主要有两个特点：
 Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接。这种设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积后，通道数会越来越大，导致参数和计算量也随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的1x1卷积也可以修正线性特征。

 <p align="center">
-<img src="image/inception.png" width="800" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/inception.png?raw=true" width="800" ><br/>
 图7. Inception模块
 </p>

@@ -115,7 +115,7 @@ GoogleNet由多组Inception模块堆积而成。另外，在网络最后也没
 GoogleNet整体网络结构如图8所示，总共22层网络：开始由3层普通的卷积组成；接下来由三组子网络组成，第一组子网络包含2个Inception模块，第二组包含5个Inception模块，第三组包含2个Inception模块；然后接均值池化层、全连接层。

 <p align="center">
-<img src="image/googlenet.jpeg" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/googlenet.jpeg?raw=true" ><br/>
 图8. GoogleNet[12]
 </p>

@@ -130,14 +130,14 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
 残差模块如图9所示，左边是基本模块连接方式，由两个输出通道数相同的3x3卷积组成。右边是瓶颈模块(Bottleneck)连接方式，之所以称为瓶颈，是因为上面的1x1卷积用来降维(图示例即256->64)，下面的1x1卷积用来升维(图示例即64->256)，这样中间3x3卷积的输入和输出通道数都较小(图示例即64->64)。

 <p align="center">
-<img src="image/resnet_block.jpg" width="400"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet_block.jpg?raw=true" width="400"><br/>
 图9. 残差模块
 </p>

 图10展示了50、101、152层网络连接示意图，使用的是瓶颈模块。这三个模型的区别在于每组中残差模块的重复次数不同(见图右上角)。ResNet训练收敛较快，成功的训练了上百乃至近千层的卷积神经网络。

 <p align="center">
-<img src="image/resnet.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/resnet.png?raw=true"><br/>
 图10. 基于ImageNet的ResNet模型
 </p>

@@ -149,7 +149,7 @@ ResNet(Residual Network) \[[15](#参考文献)\] 是2015年ImageNet图像分类
 由于ImageNet数据集较大，下载和训练较慢，为了方便大家学习，我们使用[CIFAR10](<https://www.cs.toronto.edu/~kriz/cifar.html>)数据集。CIFAR10数据集包含60,000张32x32的彩色图片，10个类别，每个类包含6,000张。其中50,000张图片作为训练集，10000张作为测试集。图11从每个类别中随机抽取了10张图片，展示了所有的类别。

 <p align="center">
-<img src="image/cifar.png" width="350"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/cifar.png?raw=true" width="350"><br/>
 图11. CIFAR10数据集[21]
 </p>

@@ -377,7 +377,7 @@ test_reader = paddle.batch(
 `event_handler_plot`可以用来利用回调数据来打点画图:

 <p align="center">
-<img src="image/train_and_test.png" width="350"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/train_and_test.png?raw=true" width="350"><br/>
 图12. 训练结果
 </p>

@@ -469,7 +469,7 @@ Test with Pass 0, Loss 1.1, Acc 0.6
 图13是训练的分类错误率曲线图，运行到第200个pass后基本收敛，最终得到测试集上分类错误率为8.54%。

 <p align="center">
-<img src="image/plot.png" width="400" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/03.image_classification/image/plot.png?raw=true" width="400" ><br/>
 图13. CIFAR10数据集上VGG模型的分类错误率
 </p>


--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/cifar.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/dog_cat.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/fea_conv0.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/flowers.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/googlenet.jpeg
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/ilsvrc.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/inception_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/lenet_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/plot_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/resnet_block.jpg
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/train_and_test.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/variations_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/image_classification/image/vgg16.png
--- a/doc/fluid/new_docs/beginners_guide/basics/index.rst
+++ b/doc/fluid/new_docs/beginners_guide/basics/index.rst
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/.gitignore
+data/train.list
+data/test.*
+data/conll05st-release.tar.gz
+data/conll05st-release
+data/predicate_dict
+data/label_dict
+data/word_dict
+data/emb
+data/feature
+output
+predict.res
+train.log
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/README.cn.md
@@ -21,7 +21,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 5. 对第4步的结果，通过多分类得到论元的语义角色标签。可以看到，句法分析是基础，并且后续步骤常常会构造的一些人工特征，这些特征往往也来自句法分析。

 <div  align="center">
-<img src="image/dependency_parsing.png" width = "80%" align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/dependency_parsing.png?raw=true" width = "80%" align=center /><br>
 图1. 依存句法分析句法树示例
 </div>

@@ -30,7 +30,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 我们继续以上面的这句话为例，图2展示了BIO表示方法。

 <div  align="center">
-<img src="image/bio_example.png" width = "90%"  align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bio_example.png?raw=true" width = "90%"  align=center /><br>
 图2. BIO标注方法示例
 </div>

@@ -53,7 +53,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 图3是最终得到的栈式循环神经网络结构示意图。

 <p align="center">  
-<img src="./image/stacked_lstm.png" width = "40%"  align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/stacked_lstm.png?raw=true" width = "40%"  align=center><br>
 图3. 基于LSTM的栈式循环神经网络结构示意图
 </p>

@@ -64,7 +64,7 @@ $$\mbox{[小明]}_{\mbox{Agent}}\mbox{[昨天]}_{\mbox{Time}}\mbox{[晚上]}_\mb
 为了克服这一缺陷，我们可以设计一种双向循环网络单元，它的思想简单且直接：对上一节的栈式循环神经网络进行一个小小的修改，堆叠多个LSTM单元，让每一层LSTM单元分别以：正向、反向、正向 …… 的顺序学习上一层的输出序列。于是，从第2层开始，$t$时刻我们的LSTM单元便总是可以看到历史和未来的信息。图4是基于LSTM的双向循环神经网络结构示意图。

 <p align="center">  
-<img src="./image/bidirectional_stacked_lstm.png" width = "60%" align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/bidirectional_stacked_lstm.png?raw=true" width = "60%" align=center><br>
 图4. 基于LSTM的双向循环神经网络结构示意图
 </p>

@@ -79,7 +79,7 @@ CRF是一种概率化结构模型，可以看作是一个概率无向图模型
 序列标注任务只需要考虑输入和输出都是一个线性序列，并且由于我们只是将输入序列作为条件，不做任何条件独立假设，因此输入序列的元素之间并不存在图结构。综上，在序列标注任务中使用的是如图5所示的定义在链式图上的CRF，称之为线性链条件随机场（Linear Chain Conditional Random Field）。

 <p align="center">  
-<img src="./image/linear_chain_crf.png" width = "35%" align=center><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/linear_chain_crf.png?raw=true" width = "35%" align=center><br>
 图5. 序列标注任务中使用的线性链条件随机场
 </p>

@@ -123,7 +123,7 @@ $$\DeclareMathOperator*{\argmax}{arg\,max} L(\lambda, D) = - \text{log}\left(\pr
 4. CRF以第3步中LSTM学习到的特征为输入，以标记序列为监督信号，完成序列标注；

 <div  align="center">  
-<img src="image/db_lstm_network.png" width = "60%"  align=center /><br>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/image/db_lstm_network.png?raw=true" width = "60%"  align=center /><br>
 图6. SRL任务上的深层双向LSTM模型
 </div>


--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bidirectional_stacked_lstm_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/bio_example_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/db_lstm_network_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/dependency_parsing_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/linear_chain_crf.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm.png
--- a/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/label_semantic_roles/image/stacked_lstm_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/.gitignore
+data/wmt14
+data/pre-wmt14
+pretrained/wmt14_model
+gen.log
+gen_result
+train.log
+dataprovider_copy_1.py
+*.pyc
+multi-bleu.perl
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/README.cn.md
@@ -11,10 +11,10 @@
 为解决以上问题，统计机器翻译（Statistical Machine Translation, SMT）技术应运而生。在统计机器翻译技术中，转化规则是由机器自动从大规模的语料中学习得到的，而非我们人主动提供规则。因此，它克服了基于规则的翻译系统所面临的知识获取瓶颈的问题，但仍然存在许多挑战：1）人为设计许多特征（feature），但永远无法覆盖所有的语言现象；2）难以利用全局的特征；3）依赖于许多预处理环节，如词语对齐、分词或符号化（tokenization）、规则抽取、句法分析等，而每个环节的错误会逐步累积，对翻译的影响也越来越大。

 近年来，深度学习技术的发展为解决上述挑战提供了新的思路。将深度学习应用于机器翻译任务的方法大致分为两类：1）仍以统计机器翻译系统为框架，只是利用神经网络来改进其中的关键模块，如语言模型、调序模型等（见图1的左半部分）；2）不再以统计机器翻译系统为框架，而是直接用神经网络将源语言映射到目标语言，即端到端的神经网络机器翻译（End-to-End Neural Machine Translation, End-to-End NMT）（见图1的右半部分），简称为NMT模型。
-![nmt](./image/nmt.png)
-<p align="center">
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/nmt.png?raw=true" width = "400" align=center/><br/>
 图1. 基于神经网络的机器翻译系统
-</p>
+</div>

 本教程主要介绍NMT模型，以及如何用PaddlePaddle来训练一个NMT模型。

@@ -45,19 +45,22 @@

 具体来说，该双向循环神经网络分别在时间维以顺序和逆序——即前向（forward）和后向（backward）——依次处理输入序列，并将每个时间步RNN的输出拼接成为最终的输出层。这样每个时间步的输出节点，都包含了输入序列中当前时刻完整的过去和未来的上下文信息。下图展示的是一个按时间步展开的双向循环神经网络。该网络包含一个前向和一个后向RNN，其中有六个权重矩阵：输入到前向隐层和后向隐层的权重矩阵（`$W_1, W_3$`），隐层到隐层自己的权重矩阵（`$W_2,W_5$`），前向隐层和后向隐层到输出层的权重矩阵（`$W_4, W_6$`）。注意，该网络的前向隐层和后向隐层之间没有连接。

-![bi_rnn](./image/bi_rnn.png)
-<p align="center">
-图3. 按时间步展开的双向循环神经网络
-</p>
+
+<div align="center">
+<img src = "https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/bi_rnn.png?raw=true" width="400"><br/>
+图2. 按时间步展开的双向循环神经网络
+</div>

 ### 编码器-解码器框架

 编码器-解码器（Encoder-Decoder）\[[2](#参考文献)\]框架用于解决由一个任意长度的源序列到另一个任意长度的目标序列的变换问题。即编码阶段将整个源序列编码成一个向量，解码阶段通过最大化预测序列概率，从中解码出整个目标序列。编码和解码的过程通常都使用RNN实现。
 ![encoder_decoder](./image/encoder_decoder.png)
-<p align="center">
-图4. 编码器-解码器框架
-</p>
+<div align="center">
+<img src ="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_decoder.png?raw=true" width="400"><br/>
+图3. 编码器-解码器框架
+</div>

+<a name="编码器"></a>
 #### 编码器

 编码阶段分为三步：
@@ -69,19 +72,17 @@
 3. 用RNN编码源语言词序列：这一过程的计算公式为`$h_i=\varnothing _\theta \left ( h_{i-1}, s_i \right )$`，其中`$h_0$`是一个全零的向量，`$\varnothing _\theta$`是一个非线性激活函数，最后得到的`$\mathbf{h}=\left \{ h_1,..., h_T \right \}$`就是RNN依次读入源语言`$T$`个词的状态编码序列。整句话的向量表示可以采用`$\mathbf{h}$`在最后一个时间步`$T$`的状态编码，或使用时间维上的池化（pooling）结果。

 第3步也可以使用双向循环神经网络实现更复杂的句编码表示，具体可以用双向GRU实现。前向GRU按照词序列`$(x_1,x_2,...,x_T)$`的顺序依次编码源语言端词，并得到一系列隐层状态`$(\overrightarrow{h_1},\overrightarrow{h_2},...,\overrightarrow{h_T})$`。类似的，后向GRU按照`$(x_T,x_{T-1},...,x_1)$`的顺序依次编码源语言端词，得到`$(\overleftarrow{h_1},\overleftarrow{h_2},...,\overleftarrow{h_T})$`。最后对于词`$x_i$`，通过拼接两个GRU的结果得到它的隐层状态，即`$h_i=\left [ \overrightarrow{h_i^T},\overleftarrow{h_i^T} \right ]^{T}$`。
-
-![encoder_attention](./image/encoder_attention.png)
-<p align="center">
-图5. 使用双向GRU的编码器
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/image/encoder_attention.png?raw=true" width="400"><br/>
+图4. 使用双向GRU的编码器
+</div>

 #### 解码器

 机器翻译任务的训练过程中，解码阶段的目标是最大化下一个正确的目标语言词的概率。思路是：
-
 1. 每一个时刻，根据源语言句子的编码信息（又叫上下文向量，context vector）`$c$`、真实目标语言序列的第`$i$`个词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。计算公式如下：
 $$z_{i+1}=\phi_{\theta '} \left ( c,u_i,z_i \right )$$
-其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用[注意力机制](#注意力机制)时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。
+其中`$\phi _{\theta '}$`是一个非线性激活函数；`$c=q\mathbf{h}$`是源语言句子的上下文向量，在不使用注意力机制时，如果[编码器](#编码器)的输出是源语言句子编码后的最后一个元素，则可以定义`$c=h_T$`；`$u_i$`是目标语言序列的第`$i$`个单词，`$u_0$`是目标语言序列的开始标记`<s>`，表示解码开始；`$z_i$`是`$i$`时刻解码RNN的隐层状态，`$z_0$`是一个全零的向量。

 2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。概率分布公式如下：
 $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
@@ -93,6 +94,7 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$

 机器翻译任务的生成过程，通俗来讲就是根据预先训练的模型来翻译源语言句子。生成过程中的解码阶段和上述训练过程的有所差异，具体介绍请见[柱搜索算法](#柱搜索算法)。

+<a name="柱搜索算法"></a>
 ### 柱搜索算法

 柱搜索（[beam search](http://en.wikipedia.org/wiki/Beam_search)）是一种启发式图搜索算法，用于在图或树中搜索有限集合中的最优扩展节点，通常用在解空间非常大的系统（如机器翻译、语音识别）中，原因是内存无法装下图或树中所有展开的解。如在机器翻译任务中希望翻译“`<s>你好<e>`”，就算目标语言字典中只有3个词（`<s>`, `<e>`, `hello`），也可能生成无限句话（`hello`循环出现的次数不定），为了找到其中较好的翻译结果，我们可采用柱搜索算法。
@@ -100,7 +102,6 @@ $$p\left ( u_{i+1}|u_{&lt;i+1},\mathbf{x} \right )=softmax(W_sz_{i+1}+b_z)$$
 柱搜索算法使用广度优先策略建立搜索树，在树的每一层，按照启发代价（heuristic cost）（本教程中，为生成词的log概率之和）对节点进行排序，然后仅留下预先确定的个数（文献中通常称为beam width、beam size、柱宽度等）的节点。只有这些节点会在下一层继续扩展，其他节点就被剪掉了，也就是说保留了质量较高的节点，剪枝了质量较差的节点。因此，搜索所占用的空间和时间大幅减少，但缺点是无法保证一定获得最优解。

 使用柱搜索算法的解码阶段，目标是最大化生成序列的概率。思路是：
-
 1. 每一个时刻，根据源语言句子的编码信息`$c$`、生成的第`$i$`个目标语言序列单词`$u_i$`和`$i$`时刻RNN的隐层状态`$z_i$`，计算出下一个隐层状态`$z_{i+1}$`。

 2. 将`$z_{i+1}$`通过`softmax`归一化，得到目标语言序列的第`$i+1$`个单词的概率分布`$p_{i+1}$`。

--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/bi_rnn_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/decoder_attention_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_attention_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/encoder_decoder_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/gru_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt.png
--- a/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/machine_translation/image/nmt_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/.gitignore
+.idea
+.ipynb_checkpoints
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/README.cn.md
@@ -37,7 +37,7 @@ Prediction Score is 4.25
 YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐系统为超过10亿用户从不断增长的视频库中推荐个性化的内容。整个系统由两个神经网络组成：候选生成网络和排序网络。候选生成网络从百万量级的视频库中生成上百个候选，排序网络对候选进行打分排序，输出排名最高的数十个结果。系统结构如图1所示：

 <p align="center">
-<img src="image/YouTube_Overview.png" width="70%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/YouTube_Overview.png?raw=true" width="70%" ><br/>
 图1. YouTube 推荐系统结构
 </p>

@@ -48,7 +48,7 @@ YouTube是世界上最大的视频上传、分享和发现网站，YouTube推荐
 首先，将观看历史及搜索词记录这类历史信息，映射为向量后取平均值得到定长表示；同时，输入人口学特征以优化新用户的推荐效果，并将二值特征和连续特征归一化处理到[0, 1]范围。接下来，将所有特征表示拼接为一个向量，并输入给非线性多层感知器（MLP，详见[识别数字](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md)教程）处理。最后，训练时将MLP的输出给softmax做分类，预测时计算用户的综合特征（MLP的输出）与所有视频的相似度，取得分最高的$k$个作为候选生成网络的筛选结果。图2显示了候选生成网络结构。

 <p align="center">
-<img src="image/Deep_candidate_generation_model_architecture.png" width="70%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/Deep_candidate_generation_model_architecture.png?raw=true" width="70%" ><br/>
 图2. 候选生成网络结构
 </p>

@@ -73,7 +73,7 @@ $$P(\omega=i|u)=\frac{e^{v_{i}u}}{\sum_{j \in V}e^{v_{j}u}}$$
 卷积神经网络主要由卷积（convolution）和池化（pooling）操作构成，其应用及组合方式灵活多变，种类繁多。本小节我们以如图3所示的网络进行讲解：

 <p align="center">
-<img src="image/text_cnn.png" width = "80%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/text_cnn.png?raw=true" width = "80%" align="center"/><br/>
 图3. 卷积神经网络文本分类模型
 </p>

@@ -107,7 +107,7 @@ $$\hat c=max(c)$$

 <p align="center">

-<img src="image/rec_regression_network.png" width="90%" ><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/05.recommender_system/image/rec_regression_network.png?raw=true" width="90%" ><br/>
 图4. 融合推荐模型
 </p>


--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/Deep_candidate_generation_model_architecture.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/YouTube_Overview.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/output_32_0.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/rec_regression_network_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn.png
--- a/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/recommender_system/image/text_cnn_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/.gitignore
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/README.cn.md
@@ -37,7 +37,7 @@
 循环神经网络是一种能对序列数据进行精确建模的有力工具。实际上，循环神经网络的理论计算能力是图灵完备的\[[4](#参考文献)\]。自然语言是一种典型的序列数据（词序列），近年来，循环神经网络及其变体（如long short term memory\[[5](#参考文献)\]等）在自然语言处理的多个领域，如语言模型、句法解析、语义角色标注（或一般的序列标注）、语义表示、图文生成、对话、机器翻译等任务上均表现优异甚至成为目前效果最好的方法。

 <p align="center">
-<img src="image/rnn.png" width = "60%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/rnn.png?raw=true" width = "60%" align="center"/><br/>
 图1. 循环神经网络按时间展开的示意图
 </p>

@@ -66,7 +66,7 @@ $$ h_t = o_t\odot tanh(c_t) $$
 其中，$i_t, f_t, c_t, o_t$分别表示输入门，遗忘门，记忆单元及输出门的向量值，带角标的$W$及$b$为模型参数，$tanh$为双曲正切函数，$\odot$表示逐元素（elementwise）的乘法操作。输入门控制着新输入进入记忆单元$c$的强度，遗忘门控制着记忆单元维持上一时刻值的强度，输出门控制着输出记忆单元的强度。三种门的计算方式类似，但有着完全不同的参数，它们各自以不同的方式控制着记忆单元$c$，如图2所示：

 <p align="center">
-<img src="image/lstm.png" width = "65%" align="center"/><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/lstm.png?raw=true" width = "65%" align="center"/><br/>
 图2. 时刻$t$的LSTM [7]
 </p>

@@ -83,7 +83,7 @@ $$ h_t=Recrurent(x_t,h_{t-1})$$
 如图3所示（以三层为例），奇数层LSTM正向，偶数层LSTM反向，高一层的LSTM使用低一层LSTM及之前所有层的信息作为输入，对最高层LSTM序列使用时间维度上的最大池化即可得到文本的定长向量表示（这一表示充分融合了文本的上下文信息，并且对文本进行了深层次抽象），最后我们将文本表示连接至softmax构建分类模型。

 <p align="center">
-<img src="image/stacked_lstm.jpg" width=450><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/06.understand_sentiment/image/stacked_lstm.jpg?raw=true" width=450><br/>
 图3. 栈式双向LSTM用于文本分类
 </p>

@@ -149,6 +149,8 @@ def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):

 网络的输入`input_dim`表示的是词典的大小，`class_dim`表示类别数。这里，我们使用[`sequence_conv_pool`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/trainer_config_helpers/networks.py) API实现了卷积和池化操作。

+<a name="栈式双向LSTM"></a>
+
 ### 栈式双向LSTM

 栈式双向神经网络`stacked_lstm_net`的代码片段如下：

--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm.png
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/lstm_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/rnn.png
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm.jpg
--- a/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/understand_sentiment/image/stacked_lstm_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/.gitignore
+data/train.list
+data/test.list
+data/simple-examples*
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/README.cn.md
@@ -34,7 +34,7 @@ $$X = USV^T$$
 本章中，当词向量训练好后，我们可以用数据可视化算法t-SNE\[[4](#参考文献)\]画出词语特征在二维上的投影（如下图所示）。从图中可以看出，语义相关的词语（如a, the, these; big, huge）在投影上距离很近，语意无关的词（如say, business; decision, japan）在投影上的距离很远。

 <p align="center">
-    <img src = "image/2d_similarity.png" width=400><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/2d_similarity.png?raw=true" width=400><br/>
    图1. 词向量的二维投影
 </p>

@@ -50,7 +50,7 @@ similarity: -0.0997506977351

 ```

-以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[应用模型](#应用模型)中详细描述用法。
+以上结果可以通过运行`calculate_dis.py`, 加载字典里的单词和对应训练特征结果得到，我们将在[模型应用](#模型应用)中详细描述用法。


 ## 模型概览
@@ -90,7 +90,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 其中$f(w_t, w_{t-1}, ..., w_{t-n+1})$表示根据历史n-1个词得到当前词$w_t$的条件概率，$R(\theta)$表示参数正则项。

 <p align="center">
-       <img src="image/nnlm.png" width=500><br/>
+       <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/nnlm.png?raw=true" width=500><br/>
       图2. N-gram神经网络模型
 </p>

@@ -122,7 +122,7 @@ $$\frac{1}{T}\sum_t f(w_t, w_{t-1}, ..., w_{t-n+1};\theta) + R(\theta)$$
 CBOW模型通过一个词的上下文（各N个词）预测当前词。当N=2时，模型如下图所示：

 <p align="center">
-    <img src="image/cbow.png" width=250><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/cbow.png?raw=true" width=250><br/>
    图3. CBOW模型
 </p>

@@ -137,7 +137,7 @@ $$context = \frac{x_{t-1} + x_{t-2} + x_{t+1} + x_{t+2}}{4}$$
 CBOW的好处是对上下文词语的分布在词向量上进行了平滑，去掉了噪声，因此在小数据集上很有效。而Skip-gram的方法中，用一个词预测其上下文，得到了当前词上下文的很多样本，因此可用于更大的数据集。

 <p align="center">
-    <img src="image/skipgram.png" width=250><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/skipgram.png?raw=true" width=250><br/>
    图4. Skip-gram模型
 </p>

@@ -189,12 +189,13 @@ dream that one day <e>

 最后，每个输入会按其单词次在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。

+<a name="训练模型"></a>
 ## 编程实现

 本配置的模型结构如下图所示：

 <p align="center">
-    <img src="image/ngram.png" width=400><br/>
+    <img src="https://github.com/PaddlePaddle/book/blob/develop/04.word2vec/image/ngram.png?raw=true" width=400><br/>
    图5. 模型配置中的N-gram神经网络模型
 </p>

@@ -349,6 +350,7 @@ Step 20: Average Cost 5.766995
 ...
 ```

+<a name="模型应用"></a>
 ## 模型应用
 在模型训练后，我们可以用它做一些预测。


--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/2d_similarity.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/cbow_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/ngram.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/nnlm_en.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/sentence_emb.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram.png
--- a/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
+++ b/doc/fluid/new_docs/beginners_guide/basics/word2vec/image/skipgram_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/README.cn.md
@@ -15,7 +15,7 @@ $$y_i = \omega_1x_{i1} + \omega_2x_{i2} + \ldots + \omega_dx_{id} + b,  i=1,\ldo
 ## 效果展示
 我们使用从[UCI Housing Data Set](https://archive.ics.uci.edu/ml/datasets/Housing)获得的波士顿房价数据集进行模型的训练和预测。下面的散点图展示了使用模型对部分房屋价格进行的预测。其中，每个点的横坐标表示同一类房屋真实价格的中位数，纵坐标表示线性回归模型根据特征预测的结果，当二者值完全相等的时候就会落在虚线上。所以模型预测得越准确，则点离虚线越近。
 <p align="center">
-    <img src = "image/predictions.png" width=400><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/predictions.png?raw=true" width=400><br/>
    图1. 预测值 V.S. 真实值
 </p>

@@ -40,13 +40,9 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
 ### 训练过程

 定义好模型结构之后，我们要通过以下几个步骤进行模型训练
-
 1. 初始化参数，其中包括权重$\omega_i$和偏置$b$，对其进行初始化（如0均值，1方差）。
-
 2. 网络正向传播计算网络输出和损失函数。
-
 3. 根据损失函数进行反向误差传播 （[backpropagation](https://en.wikipedia.org/wiki/Backpropagation)），将网络误差从输出层依次向前传递, 并更新网络中的参数。
-
 4. 重复2~3步骤，直至网络训练误差达到规定的程度或训练轮次达到设定值。

 ## 数据集
@@ -84,7 +80,7 @@ $$MSE=\frac{1}{n}\sum_{i=1}^{n}{(\hat{Y_i}-Y_i)}^2$$
 - 很多的机器学习技巧/模型（例如L1，L2正则项，向量空间模型-Vector Space Model）都基于这样的假设：所有的属性取值都差不多是以0为均值且取值范围相近的。

 <p align="center">
-    <img src = "image/ranges.png" width=550><br/>
+    <img src = "https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/ranges.png?raw=true" width=550><br/>
    图2. 各维属性的取值范围
 </p>

@@ -199,10 +195,12 @@ step = 0
 def event_handler_plot(event):
    global step
    if isinstance(event, fluid.EndStepEvent):
-        if event.step % 10 == 0: # record the test cost every 10 seconds
+        if step % 10 == 0:   # record a train cost every 10 batches
+            plot_cost.append(train_title, step, event.metrics[0])
+
+        if step % 100 == 0:  # record a test cost every 100 batches
            test_metrics = trainer.test(
                reader=test_reader, feed_order=feed_order)
-
            plot_cost.append(test_title, step, test_metrics[0])
            plot_cost.plot()

@@ -210,12 +208,13 @@ def event_handler_plot(event):
                # If the accuracy is good enough, we can stop the training.
                print('loss is less than 10.0, stop')
                trainer.stop()
+        step += 1

+    if isinstance(event, fluid.EndEpochEvent):
+        if event.epoch % 10 == 0:
            # We can save the trained parameters for the inferences later
            if params_dirname is not None:
                trainer.save_params(params_dirname)
-
-        step += 1
 ```

 ### 开始训练
@@ -231,11 +230,10 @@ trainer.train(
    event_handler=event_handler_plot,
    feed_order=feed_order)
 ```
-
-<p align="center">
-    <img src = "image/train_and_test1.png" width=400><br/>
-    图3. 训练结果
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/01.fit_a_line/image/train_and_test.png?raw=true" width="400"><br/>
+图3. 训练结果
+</div>


 ## 预测
@@ -262,18 +260,18 @@ inferencer = fluid.Inferencer(
 batch_size = 10
 test_reader = paddle.batch(paddle.dataset.uci_housing.test(),batch_size=batch_size)
 test_data = test_reader().next()
-test_feat = numpy.array([data[0] for data in test_data]).astype("float32")
-test_label = numpy.array([data[1] for data in test_data]).astype("float32")
+test_x = numpy.array([data[0] for data in test_data]).astype("float32")
+test_y = numpy.array([data[1] for data in test_data]).astype("float32")

-results = inferencer.infer({'x': test_feat})
+results = inferencer.infer({'x': test_x})

 print("infer results: (House Price)")
-for k in range(0, batch_size-1):
-    print("%d. %f" % (k, results[0][k]))
+for idx, val in enumerate(results[0]):
+    print("%d: %.2f" % (idx, val))

 print("\nground truth:")
-for k in range(0, batch_size-1):
-    print("%d. %f" % (k, test_label[k]))
+for idx, val in enumerate(test_y):
+    print("%d: %.2f" % (idx, val))
 ```

 ## 总结

--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/predictions_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/ranges_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/fit_a_line/image/train_and_test1.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/README.cn.md
@@ -6,8 +6,8 @@
 当我们学习编程的时候，编写的第一个程序一般是实现打印"Hello World"。而机器学习（或深度学习）的入门教程，一般都是 [MNIST](http://yann.lecun.com/exdb/mnist/) 数据库上的手写识别问题。原因是手写识别属于典型的图像分类问题，比较简单，同时MNIST数据集也很完备。MNIST数据集作为一个简单的计算机视觉数据集，包含一系列如图1所示的手写数字图片和对应的标签。图片是28x28的像素矩阵，标签则对应着0~9的10个数字。每张图片都经过了大小归一化和居中处理。

 <p align="center">
-    <img src="image/mnist_example_image.png" width="400"><br/>
-    图1. MNIST图片示例
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mnist_example_image.png?raw=true" width="400"><br/>
+图1. MNIST图片示例
 </p>

 MNIST数据集是从 [NIST](https://www.nist.gov/srd/nist-special-database-19) 的Special Database 3（SD-3）和Special Database 1（SD-1）构建而来。由于SD-3是由美国人口调查局的员工进行标注，SD-1是由美国高中生进行标注，因此SD-3比SD-1更干净也更容易识别。Yann LeCun等人从SD-1和SD-3中各取一半作为MNIST的训练集（60000条数据）和测试集（10000条数据），其中训练集来自250位不同的标注员，此外还保证了训练集和测试集的标注员是不完全相同的。
@@ -40,12 +40,12 @@ $$ y_i = \text{softmax}(\sum_j W_{i,j}x_j + b_i) $$

 在分类问题中，我们一般采用交叉熵代价损失函数（cross entropy loss），公式如下：

-$$  L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
+$$  L_{cross-entropy}(label, y) = -\sum_i label_ilog(y_i) $$

 图2为softmax回归的网络图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。

 <p align="center">
-<img src="image/softmax_regression.png" width=400><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/softmax_regression.png?raw=true" width=400><br/>
 图2. softmax回归网络结构图<br/>
 </p>

@@ -54,16 +54,14 @@ $$  L_{cross-entropy} (label, y) = -\sum_i label_ilog(y_i) $$
 Softmax回归模型采用了最简单的两层神经网络，即只有输入层和输出层，因此其拟合能力有限。为了达到更好的识别效果，我们考虑在输入层和输出层中间加上若干个隐藏层\[[10](#参考文献)\]。

 1.  经过第一个隐藏层，可以得到 $ H_1 = \phi(W_1X + b_1) $，其中$\phi$代表激活函数，常见的有sigmoid、tanh或ReLU等函数。
-
 2.  经过第二个隐藏层，可以得到 $ H_2 = \phi(W_2H_1 + b_2) $。
-
 3.  最后，再经过输出层，得到的$Y=\text{softmax}(W_3H_2 + b_3)$，即为最后的分类结果向量。


 图3为多层感知器的网络结构图，图中权重用蓝线表示、偏置用红线表示、+1代表偏置参数的系数为1。

 <p align="center">
-<img src="image/mlp.png" width=500><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/mlp.png?raw=true" width=500><br/>
 图3. 多层感知器网络结构图<br/>
 </p>

@@ -72,7 +70,7 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 在多层感知器模型中，将图像展开成一维向量输入到网络中，忽略了图像的位置和结构信息，而卷积神经网络能够更好的利用图像的结构信息。[LeNet-5](http://yann.lecun.com/exdb/lenet/)是一个较简单的卷积神经网络。图4显示了其结构：输入的二维图像，先经过两次卷积层到池化层，再经过全连接层，最后使用softmax分类作为输出层。下面我们主要介绍卷积层和池化层。

 <p align="center">
-<img src="image/cnn.png"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/cnn.png?raw=true" width="400"><br/>
 图4. LeNet-5卷积神经网络结构<br/>
 </p>

@@ -81,7 +79,7 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 卷积层是卷积神经网络的核心基石。在图像识别里我们提到的卷积是二维卷积，即离散二维滤波器（也称作卷积核）与二维图像做卷积操作，简单的讲是二维滤波器滑动到二维图像上所有位置，并在每个位置上与该像素点及其领域像素点做内积。卷积操作被广泛应用与图像处理领域，不同卷积核可以提取不同的特征，例如边沿、线性、角等特征。在深层卷积神经网络中，通过卷积操作可以提取出图像低级到复杂的特征。

 <p align="center">
-<img src="image/conv_layer.png" width='750'><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/conv_layer.png?raw=true" width='750'><br/>
 图5. 卷积层图片<br/>
 </p>

@@ -98,16 +96,15 @@ Softmax回归模型采用了最简单的两层神经网络，即只有输入层
 #### 池化层

 <p align="center">
-<img src="image/max_pooling.png" width="400px"><br/>
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/max_pooling.png?raw=true" width="400px"><br/>
 图6. 池化层图片<br/>
 </p>

 池化是非线性下采样的一种形式，主要作用是通过减少网络的参数来减小计算量，并且能够在一定程度上控制过拟合。通常在卷积层的后面会加上一个池化层。池化包括最大池化、平均池化等。其中最大池化是用不重叠的矩形框将输入层分成不同的区域，对于每个矩形框的数取最大值作为输出层，如图6所示。

-更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类](https://github.com/PaddlePaddle/book/blob/develop/image_classification/README.md)教程。
+更详细的关于卷积神经网络的具体知识可以参考[斯坦福大学公开课]( http://cs231n.github.io/convolutional-networks/ )和[图像分类]( https://github.com/PaddlePaddle/book/tree/develop/03.image_classification )教程。

 ### 常见激活函数介绍  
-
 - sigmoid激活函数： $ f(x) = sigmoid(x) = \frac{1}{1+e^{-x}} $

 - tanh激活函数： $ f(x) = tanh(x) = \frac{e^x-e^{-x}}{e^x+e^{-x}} $
@@ -136,20 +133,18 @@ PaddlePaddle在API中提供了自动加载[MNIST](http://yann.lecun.com/exdb/mni
 我们建议使用 Fluid API，因为它更容易学起来。

 下面是快速的 Fluid API 概述。
-
 1. `inference_program`：指定如何从数据输入中获得预测的函数。
 这是指定网络流的地方。

-2. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
+1. `train_program`：指定如何从 `inference_program` 和`标签值`中获取 `loss` 的函数。
 这是指定损失计算的地方。

-3. `optimizer_func`: “指定优化器配置的函数。优化器负责减少损失并驱动培训。Paddle 支持多种不同的优化器。
+1. `optimizer_func`: 指定优化器配置的函数。优化器负责减少损失并驱动训练。Paddle 支持多种不同的优化器。

-4. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
+1. `Trainer`：PaddlePaddle Trainer 管理由 `train_program` 和 `optimizer` 指定的训练过程。
 通过 `event_handler` 回调函数，用户可以监控培训的进展。

-5. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
-
+1. `Inferencer`：Fluid inferencer 加载 `inference_program` 和由 Trainer 训练的参数。
 然后，它可以推断数据和返回预测。

 在这个演示中，我们将深入了解它们。
@@ -240,6 +235,7 @@ def train_program():
    acc = fluid.layers.accuracy(input=predict, label=label)
    return [avg_cost, acc]

+
 ```

 #### Optimizer Function 配置
@@ -255,9 +251,9 @@ def optimizer_program():

 下一步，我们开始训练过程。`paddle.dataset.movielens.train()`和`paddle.dataset.movielens.test()`分别做训练和测试数据集。这两个函数各自返回一个reader——PaddlePaddle中的reader是一个Python函数，每次调用的时候返回一个Python yield generator。

-下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B 。reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。
+下面`shuffle`是一个reader decorator，它接受一个reader A，返回另一个reader B。reader B 每次读入`buffer_size`条训练数据到一个buffer里，然后随机打乱其顺序，并且逐条输出。

-`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader 。在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。
+`batch`是一个特殊的decorator，它的输入是一个reader，输出是一个batched reader。在PaddlePaddle里，一个reader每次yield一条训练数据，而一个batched reader每次yield一个minibatch。

 ```python
 train_reader = paddle.batch(
@@ -280,7 +276,6 @@ place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

 trainer = fluid.Trainer(
    train_func=train_program, place=place, optimizer_func=optimizer_program)
-
 ```

 #### Event Handler 配置
@@ -315,11 +310,10 @@ def event_handler(event):

 `event_handler_plot` 可以用来在训练过程中画图如下：

-
-<p align="center">
-<img src="image/train_and_test2.png" width="400"><br/>
-图7. 训练结果
-</p>
+<div align="center">
+<img src="https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/image/train_and_test.png?raw=true" width="400"><br/>
+图7. 训练结果
+</div>


 ```python

--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/cnn_train_log_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/conv_layer.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/infer_3.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/max_pooling_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mlp_train_log_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/mnist_example_image.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_regression_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/softmax_train_log_en.png
--- a/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png
+++ b/doc/fluid/new_docs/beginners_guide/quick_start/recognize_digits/image/train_and_test2.png
--- a/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
+++ b/doc/fluid/new_docs/user_guides/howto/debug/visualdl.md
@@ -149,7 +149,7 @@ python setup.py bdist_wheel
 pip install --upgrade dist/visualdl-*.whl
 ```

-如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/how_to_dev_frontend_en.md)
+如果打包和安装遇到其他问题，不安装只想运行Visual DL可以看[这里](https://github.com/PaddlePaddle/VisualDL/blob/develop/docs/develop/how_to_dev_frontend_cn.md)


 ## SDK

--- a/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/build_and_install_lib_cn.rst
--- a/doc/fluid/new_docs/user_guides/howto/inference/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/inference/index.rst
+############
+模型预测部署
+############
+
+PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线
+
+.. toctree::
+   :maxdepth: 2
+
+   build_and_install_lib_cn.rst
+   native_infer.rst
--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -4,12 +4,13 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。

-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`_
 包括

 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
 -  库文件\ ``libpaddle_fluid.so`` 或 ``libpaddle_fluid.a``

+
 编译和依赖可以参考 :ref:`install_or_build_cpp_inference_lib` 。

 下面是一些 API 概念的介绍
@@ -95,7 +96,7 @@ engine
    CHECK(predictor->Run(slots, &outputs));
    // 获取 outputs ...

-编译时，联编 ``libpaddle_fluid.a/.so`` 即可。
+编译时，联编 ``libpaddle_fluid.a/.so`` 便可。

 详细代码参考
 ------------

--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/index.rst
@@ -38,7 +38,6 @@ PaddlePaddle Fluid支持两种传入数据的方式:
   :maxdepth: 2

   feeding_data
-   use_recordio_reader

 Python Reader
 #############

--- a/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
+++ b/doc/fluid/new_docs/user_guides/howto/prepare_data/use_recordio_reader.rst
-.. _user_guide_use_recordio_as_train_data:
-
-############################
-使用RecordIO文件作为训练数据
-############################
-
-相比于 :ref:`user_guide_use_numpy_array_as_train_data`，
-:ref:`user_guide_use_recordio_as_train_data` 的性能更好；
-但是用户需要先将训练数据集转换成RecordIO文件格式，再使用
-:code:`fluid.layers.open_files()` 层在神经网络配置中导入 RecordIO 文件。
-用户还可以使用 :code:`fluid.layers.double_buffer()` 加速数据从内存到显存的拷贝，
-使用 :code:`fluid.layers.Preprocessor` 工具进行数据增强。
-
-将训练数据转换成RecordIO文件格式
-################################
-
-:code:`fluid.recordio_writer` 中，每个记录都是一个
-:code:`vector<LoDTensor>`, 即一个支持序列信息的Tensor数组。这个数组包括训练所需
-的所有特征。例如对于图像分类来说，这个数组可以包含图片和分类标签。
-
-用户可以使用 :code:`fluid.recordio_writer.convert_reader_to_recordio_file()` 可以将
-:ref:`user_guide_reader` 转换成一个RecordIO文件。或者可以使用
-:code:`fluid.recordio_writer.convert_reader_to_recordio_files()` 将一个
-:ref:`user_guide_reader` 转换成多个RecordIO文件。
-
-具体使用方法为:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   import numpy
-
-   def reader_creator():
-       def __impl__():
-           for i in range(1000):
-               yield [
-                        numpy.random.random(size=[3,224,224], dtype="float32"),
-                        numpy.random.random(size=[1], dtype="int64")
-                     ]
-       return __impl__
-
-   img = fluid.layers.data(name="image", shape=[3, 224, 224])
-   label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-   feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
-
-   BATCH_SIZE = 32
-   reader = paddle.batch(reader_creator(), batch_size=BATCH_SIZE)
-   fluid.recordio_writer.convert_reader_to_recordio_file(
-      "train.recordio", feeder=feeder, reader_creator=reader)
-
-其中 :code:`reader_creator` 创建了一个 :code:`Reader`。
-:ref:`_api_fluid_data_feeder_DataFeeder`
-是将 :code:`Reader` 转换成 :code:`LoDTensor` 的工具。详细请参考
-:ref:`user_guide_reader` 。
-
-上述程序将 :code:`reader_creator` 的数据转换成了 :code:`train.recordio` 文件，
-其中每一个record 含有 32 条样本。如果batch size会在训练过程中调整，
-用户可以将每一个Record的样本数设置成1。并参考
-:ref:`user_guide_use_recordio_as_train_data_use_op_create_batch`。
-
-
-配置神经网络, 打开RecordIO文件
-##############################
-
-RecordIO文件转换好之后，用户可以使用 :code:`fluid.layers.open_files()`
-打开文件，并使用 :code:`fluid.layers.read_file` 读取文件内容。
-简单使用方法如下:
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-
-   file_obj = fluid.layers.open_files(
-     filenames=["train.recordio"],
-     shape=[[3, 224, 224], [1]],
-     lod_levels=[0, 0],
-     dtypes=["float32", "int64"],
-     pass_num=100
-   )
-
-   image, label = fluid.layers.read_file(file_obj)
-
-其中如果设置了 :code:`pass_num` ，那么当所有数据读完后，会重新读取数据，
-直到读取了 :code:`pass_num` 遍。
-
-
-
-进阶使用
-########
-
-
-使用 :code:`fluid.layers.double_buffer()`
------------------------------------------
-
-:code:`Double buffer` 使用双缓冲技术，将训练数据从内存中复制到显存中。配置双缓冲
-需要使用 :code:`fluid.layers.double_buffer()` 修饰文件对象。 例如:
-
-.. code-block:: python
-
-   import paddle.fliud as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.double_buffer(file_obj)
-
-   image, label = fluid.layers.read_file(file_obj)
-
-双缓冲技术可以参考
-`Multiple buffering <https://en.wikipedia.org/wiki/Multiple_buffering>`_ 。
-
-配置数据增强
------------
-
-使用 :code:`fluid.layers.Preprocessor` 可以配置文件的数据增强方法。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   preprocessor = fluid.layers.Preprocessor(reader=data_file)
-   with preprocessor.block():
-       image, label = preprocessor.inputs()
-       image = image / 2
-       label = label + 1
-       preprocessor.outputs(image, label)
-
-如上代码所示，使用 :code:`Preprocessor` 定义了一个数据增强模块，并在
-:code:`with preprocessor.block()` 中定义了数据增强的具体操作。 用户通过配置
-:code:`preprocessor.inputs()` 获得数据文件中的各个字段。 并用
-:code:`preprocessor.outputs()` 标记预处理后的输出。
-
-.. _user_guide_use_recordio_as_train_data_use_op_create_batch:
-
-使用Op组batch
-------------
-
-使用 :code:`fluid.layers.batch()` 可以在训练的过程中动态的组batch。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fluid.layers.batch(file_obj, batch_size=32)
-
-   img, label = fluid.layers.read_file(file_obj)
-
-需要注意的是，如果数据集中的最后几个样本不能组成 :code:`batch_size` 大小的批量数据，
-那么这几个样本直接组成一个批量数据进行训练。
-
-读入数据的shuffle
-----------------
-
-使用 :code:`fluid.layers.shuffle()` 可以在训练过程中动态重排训练数据。例如
-
-.. code-block:: python
-
-   import paddle.fluid as fluid
-   file_obj = fluid.layers.open_files(...)
-   file_obj = fliud.layers.shuffle(file_obj, buffer_size=8192)
-
-   img, label = fliud.layers.read_file(file_obj)
-
-需要注意的是:
-
-1. :code:`shuffle` 实现方法是:
-先读入 :code:`buffer_size` 条样本，再随机的选出样本进行训练。
-
-2. :code:`shuffle` 中 :code:`buffer_size` 会占用训练内存，需要确定训练过程中内存
-足够支持缓存 :code:`buffer_size` 条数据。
--- a/doc/fluid/new_docs/user_guides/index.rst
+++ b/doc/fluid/new_docs/user_guides/index.rst
@@ -15,4 +15,5 @@
    howto/training/index
    howto/debug/index
    howto/evaluation/index
+    howto/inference/index
    models/index.rst
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -172,6 +172,7 @@ paddle.fluid.layers.sequence_mask ArgSpec(args=['x', 'maxlen', 'dtype', 'name'],
 paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,))
 paddle.fluid.layers.pad2d ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None))
 paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None))
+paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -311,7 +312,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
-paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 200, 1))
+paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
@@ -375,7 +376,7 @@ paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'l
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))

--- a/paddle/fluid/framework/.gitignore
+++ b/paddle/fluid/framework/.gitignore
+.tensor_util.cu
+.data_type_transform.cu
\ No newline at end of file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
+# windows_symbolic(<target> SRCS <name>.cu)
+#
+# Windows treats a symbolic-link file as a regular file, unlike Unix. For each
+# entry in SRCS this helper registers a rule that copies <name>.cc to a hidden
+# file .<name>.cu in the current source directory, so that the hidden copy can
+# be compiled instead of the original <name>.cu.
+#
+#   TARGET  name of the custom target that drives the copy.
+#   SRCS    sources to mirror; only the base name (NAME_WE) of each is used.
+#   DEPS    accepted by the argument parser but not used.
+#
+# NOTE: the custom target is created inside the loop, so pass one source per
+# call; a second source would attempt to define TARGET a second time.
+function(windows_symbolic TARGET)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  foreach(src ${windows_symbolic_SRCS})
+    get_filename_component(src ${src} NAME_WE)
+    # Both the real .cc file and its .cu counterpart have to be present;
+    # FATAL_ERROR stops the configure step when either one is missing.
+    if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" OR NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu")
+      message(FATAL_ERROR "${src}.cc and ${src}.cu must exist, and ${src}.cu must be a symbolic file.")
+    endif()
+    # Refresh the hidden copy from the .cc source.
+    add_custom_command(OUTPUT .${src}.cu
+            COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
+            COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
+            COMMENT "create hidden file of ${src}.cu")
+    add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
+  endforeach()
+endfunction()
+
 add_subdirectory(ir)
 if (NOT WIN32)
 add_subdirectory(details)
@@ -11,7 +30,13 @@ nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
+  if (WIN32)
+    windows_symbolic(tensor_util SRCS tensor_util.cu)
+    nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
+    add_dependencies(tensor tensor_util)
+  else()
    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+  endif(WIN32)
 else()
  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
@@ -55,7 +80,13 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
        DEPS operator op_registry device_context math_function)

 if(WITH_GPU)
+  if (WIN32)
+      windows_symbolic(hidden_file SRCS data_type_transform.cu)
+      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
+      add_dependencies(data_type_transform hidden_file)
+  else()
      nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  endif(WIN32)
  nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
  cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -326,7 +326,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
  ir::Graph &result = *graph;

  for (auto &node : nodes) {
-    if (node->NodeType() == ir::Node::Type::kVariable && node->Var()) {
+    if (node->IsVar() && node->Var()) {
      all_vars_.emplace(node->Name(), node->Var());
    }
  }
@@ -583,18 +583,6 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
  }
 }

-bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
-    const std::string &og,
-    std::unordered_set<std::string> *og_has_been_broadcast) const {
-  bool is_pg_once =
-      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
-  if (is_pg_once) {
-    // Insert NCCL AllReduce Op
-    og_has_been_broadcast->insert(og);
-  }
-  return is_pg_once;
-}
-
 int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph,
                                           ir::Node *node) const {
  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
@@ -688,20 +676,6 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
  return var;
 }

-// Find the first occurence of `prev_op_name` and make current `op` depend
-// on it.
-void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op,
-                                        const std::string &prev_op_name) const {
-  for (auto &prev_op : result->Get<GraphOps>(kGraphOps)) {
-    if (prev_op->Name() == prev_op_name) {
-      auto *dep_var = new DummyVarHandle(result->CreateControlDepVar());
-      prev_op->AddOutput(dep_var);
-      result->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
-      op->AddInput(dep_var);
-    }
-  }
-}
-
 void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                                                ir::Node *node) const {
  int op_dev_id = -1;

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -69,9 +69,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  std::vector<std::string> FindDistTrainRecvVars(
      const std::vector<ir::Node *> &nodes) const;

-  void ConnectOp(ir::Graph *result, OpHandleBase *op,
-                 const std::string &prev_op_name) const;
-
  void CreateComputationalOps(ir::Graph *result, ir::Node *node,
                              size_t num_places) const;

@@ -83,10 +80,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  void CreateComputationalOp(ir::Graph *result, ir::Node *node,
                             int dev_id) const;

-  bool IsParameterGradientOnce(
-      const std::string &og,
-      std::unordered_set<std::string> *og_has_been_broadcast) const;
-
  int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const;

  void InsertAllReduceOp(ir::Graph *result, const std::string &og) const;

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
+set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
+file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
+# pass_library(<target>)
+#
+# Declares the IR pass <target>:
+#   * builds <target>.cc as a cc_library depending on graph_pattern_detector
+#     and pass;
+#   * appends a USE_PASS(<target>); line to the generated header ${pass_file};
+#   * prepends <target> to PASS_LIBRARY in the caller's scope.
+# SRCS and DEPS are parsed (into op_library_* variables) but are not used by
+# the body below.
+function(pass_library TARGET)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass)
+    file(APPEND ${pass_file} "USE_PASS(${TARGET});\n")
+    set(PASS_LIBRARY ${TARGET} ${PASS_LIBRARY} PARENT_SCOPE)
+endfunction()
+
 cc_library(node SRCS node.cc DEPS proto_desc)
 cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
-cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
-cc_library(graph_to_program_pass SRCS graph_to_program_pass.cc DEPS graph pass graph_helper)
 cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
 cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits)
-cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(attention_lstm_fuse_pass SRCS attention_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass)
-cc_library(fc_lstm_fuse_pass SRCS fc_lstm_fuse_pass.cc DEPS graph graph_pattern_detector)
-cc_library(seq_concat_fc_fuse_pass SRCS seq_concat_fc_fuse_pass.cc DEPS graph graph_pattern_detector)
+
+pass_library(graph_to_program_pass)
+pass_library(graph_viz_pass)
+pass_library(fc_fuse_pass)
+pass_library(attention_lstm_fuse_pass)
+pass_library(infer_clean_graph_pass)
+pass_library(fc_lstm_fuse_pass)
+pass_library(seq_concat_fc_fuse_pass)
+set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
-cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detector graph pass graph_traits framework_proto)
+cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -13,10 +13,10 @@
 // limitations under the License.

 #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h"
+#include <string>
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/inference/api/helper.h"

 namespace paddle {
 namespace framework {
@@ -96,17 +96,13 @@ void FindWhileOp(Graph* graph) {
  auto* cell_init = graph->RetriveNode(6);
  auto* hidden_init = graph->RetriveNode(8);

-#define LINK_TO(node0, node1)      \
-  node0->outputs.push_back(node1); \
-  node1->inputs.push_back(node0);
-
  auto* lstm_op = graph->CreateOpNode(&op_desc);
  PrepareParameters(graph, param);

-  LINK_TO(X, lstm_op);
-  LINK_TO(cell_init, lstm_op);
-  LINK_TO(hidden_init, lstm_op);
-  LINK_TO(lstm_op, LSTMOUT);
+  IR_NODE_LINK_TO(X, lstm_op);
+  IR_NODE_LINK_TO(cell_init, lstm_op);
+  IR_NODE_LINK_TO(hidden_init, lstm_op);
+  IR_NODE_LINK_TO(lstm_op, LSTMOUT);

  GraphSafeRemoveNodes(graph, marked_nodes);
 }
@@ -216,11 +212,11 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,

  float* out_data = out->mutable_data<float>(platform::CPUPlace());
  std::array<const float*, 4> tensors(
-      {W_forget_w0.data<float>(), W_input_w0.data<float>(),
-       W_output_w0.data<float>(), W_cell_w0.data<float>()});
+      {{W_forget_w0.data<float>(), W_input_w0.data<float>(),
+        W_output_w0.data<float>(), W_cell_w0.data<float>()}});
  std::array<const float*, 4> tensors1(
-      {W_forget_w1.data<float>(), W_input_w1.data<float>(),
-       W_output_w1.data<float>(), W_cell_w1.data<float>()});
+      {{W_forget_w1.data<float>(), W_input_w1.data<float>(),
+        W_output_w1.data<float>(), W_cell_w1.data<float>()}});

  for (int row = 0; row < D; row++) {
    for (int col = 0; col < 4; col++) {
@@ -243,8 +239,8 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
                     const LoDTensor& B_output, const LoDTensor& B_cell,
                     LoDTensor* out) {
  std::array<const float*, 4> tensors(
-      {B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
-       B_cell.data<float>()});
+      {{B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
+        B_cell.data<float>()}});

  PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
  int D = B_forget.dims()[0];

--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -21,74 +21,26 @@ namespace paddle {
 namespace framework {
 namespace ir {

-bool VarOutLinksToOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-void BuildFCPattern(PDPattern* pattern) {
-  // Create Operators
-  auto* mul_op = pattern->NewNode("mul")->assert_is_op("mul");
-  auto* elementwise_add_op =
-      pattern->NewNode("elementwise_add")->assert_is_op("elementwise_add");
-  // Create variables
-  // w
-  auto* mul_weight_var = pattern->NewNode("mul_weight")
-                             ->AsInput()
-                             ->assert_is_op_nth_input("mul", "Y", 0);
-  // x
-  auto* mul_tmp_var = pattern->NewNode("mul_tmp_var")
-                          ->AsInput()
-                          ->assert_is_op_nth_input("mul", "X", 0);
-  // intermediate variable, will be removed in the IR after fuse.
-  auto* mul_out_var = pattern->NewNode("mul_out")
-                          ->AsIntermediate()
-                          ->assert_is_only_output_of_op("mul")
-                          ->assert_is_op_input("elementwise_add");
-  // bias
-  auto* elementwise_add_tmp_var = pattern->NewNode("elementwise_add_tmpvar")
-                                      ->assert_is_op_input("elementwise_add")
-                                      ->AsInput();
-  // output
-  auto* elementwise_add_out_var = pattern->NewNode("elementwise_add_out")
-                                      ->AsOutput()
-                                      ->assert_is_op_output("elementwise_add");
-
-  mul_op->LinksFrom({mul_weight_var, mul_tmp_var}).LinksTo({mul_out_var});
-  elementwise_add_op->LinksFrom({mul_out_var, elementwise_add_tmp_var})
-      .LinksTo({elementwise_add_out_var});
-}
-
-// Replace the node `from` in the links to `to`
-bool LinksReplace(std::vector<Node*>* links, Node* from, Node* to) {
-  for (auto*& n : *links) {
-    if (n == from) {
-      n = to;
-      return true;
-    }
-  }
-  return false;
-}
-
 std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
-  FusePassBase::Init("fc", graph.get());
+  FusePassBase::Init("fc_fuse", graph.get());

  std::unordered_set<Node*> nodes2delete;

  GraphPatternDetector gpd;
-  BuildFCPattern(gpd.mutable_pattern());
+  // BuildFCPattern(gpd.mutable_pattern());
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("fc_fuse/x")
+                ->AsInput()
+                ->assert_is_op_input("mul", "X");
+  patterns::FC(gpd.mutable_pattern(), "fc_fuse", x, true /*with bias*/);

 #define GET_NODE(id)                                                         \
-  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode(#id)), \
+  PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetrieveNode("fc_fuse/" #id)), \
                 "pattern has no Node called %s", #id);                      \
-  auto* id = subgraph.at(gpd.pattern().RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+  auto* id = subgraph.at(gpd.pattern().RetrieveNode("fc_fuse/" #id));        \
+  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", "fc_fuse/" #id);

  int found_fc_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -98,10 +50,10 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
    // scenerio.
    // FC's fusion is simple, just op fuse, no need to process the
    // parameters.
-    GET_NODE(mul_tmp_var);             // x
-    GET_NODE(mul_weight);              // Y
-    GET_NODE(elementwise_add_tmpvar);  // bias
-    GET_NODE(elementwise_add_out);     // Out
+    GET_NODE(x);                // x
+    GET_NODE(w);                // Y
+    GET_NODE(fc_bias);          // bias
+    GET_NODE(fc_out);           // Out
    GET_NODE(mul);              // MUL op
    GET_NODE(elementwise_add);  // ELEMENT_ADD op
    GET_NODE(mul_out);          // tmp
@@ -109,32 +61,22 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(

    // Create an FC Node.
    OpDesc desc;
-    std::string fc_x_in = mul_tmp_var->Name();
-    std::string fc_Y_in = mul_weight->Name();
-    std::string fc_bias_in = elementwise_add_tmpvar->Name();
-    std::string fc_out = elementwise_add_out->Name();
+    std::string fc_x_in = x->Name();
+    std::string fc_Y_in = w->Name();
+    std::string fc_bias_in = fc_bias->Name();
+    std::string fc_out_out = fc_out->Name();
    desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
    desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
    desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
-    desc.SetOutput("Out", std::vector<std::string>({fc_out}));
+    desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
    desc.SetType("fc");
    auto fc_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    fc_node->inputs =
-        std::vector<Node*>({mul_tmp_var, mul_weight, elementwise_add_tmpvar});
-    fc_node->outputs.push_back(elementwise_add_out);
-
-    // Update link relatons
-    PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node));
-    PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs,
-                                elementwise_add, fc_node));
-    PADDLE_ENFORCE(
-        LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node));
+    GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out});

-    // Drop old nodes
-    graph->RemoveNode(mul);
-    graph->RemoveNode(elementwise_add);
-    graph->RemoveNode(mul_out);  // tmp variable
+    IR_NODE_LINK_TO(x, fc_node);
+    IR_NODE_LINK_TO(w, fc_node);
+    IR_NODE_LINK_TO(fc_bias, fc_node);
+    IR_NODE_LINK_TO(fc_node, fc_out);

    found_fc_count++;
  };

--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -11,39 +11,39 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"

 namespace paddle {
 namespace framework {
 namespace ir {

-std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
-    std::unique_ptr<ir::Graph> graph) const {
-  GraphPatternDetector gpd;
-  auto* pattern = gpd.mutable_pattern();
-
-  std::unordered_set<int> fused_ops({// first lstm
-                                     13, 15, 16,
-                                     // second lstm
-                                     23, 25, 26});
-
-  pattern->NewNode([&](Node* x) { return fused_ops.count(x->id()); },
-                   "any_node");
+std::string GenNodeName(const std::string& prefix, const std::string& name) {
+  return prefix + "/" + name;
+}

-  std::unordered_set<Node*> marked_nodes;
+void BuildPattern(PDPattern* pattern, const std::string& name_scope,
+                  bool with_fc_bias) {
+  PDNode* x = pattern->NewNode(name_scope, "x")
+                  ->assert_is_op_input("mul")
+                  ->assert_var_not_persistable();
+  auto* fc_out = patterns::FC(pattern, name_scope, x, with_fc_bias);
+  fc_out->AsIntermediate();  // fc_out is a tmp var, will be removed after fuse.
+  patterns::LSTM(pattern, name_scope, fc_out);
+  // LOG(INFO) << "\n" << pattern->DotString();
+}

-  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
-                     Graph* g) {
+int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
+                bool with_fc_bias) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();

-    auto* id = subgraph.at(gpd.pattern().RetrieveNode("any_node"));
-    marked_nodes.insert(id);
-  };
-  gpd(graph.get(), handler);
+  BuildPattern(pattern, name_scope, with_fc_bias);

  // Create New OpDesc
  auto lstm_creator = [&](int lstm, int input, int weight_x, int weight_h,
-                          int bias, int hidden, int cell, int xx) {
+                          int bias, int hidden, int cell, int xx, int fc_bias) {
 #define GET_NODE(x) auto* x##_n = graph->RetriveNode(x);
    GET_NODE(input);
    GET_NODE(weight_x);
@@ -61,61 +61,147 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
    SET_IN(WeightX, weight_x);
    SET_IN(WeightH, weight_h);
    SET_IN(Bias, bias);
-#undef GET_NODE
 #undef SET_IN
+    if (with_fc_bias) {
+      // Add FC-bias with LSTM-bias and create a new weight
+      PADDLE_ENFORCE(scope);
+      const std::string& new_bias_var = name_scope + "_bias.new";
+      auto* bias_var = scope->Var(new_bias_var);
+      PADDLE_ENFORCE(bias_var);
+      auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
+      auto* lstm_bias_var = scope->FindVar(bias_n->Name());
+      PADDLE_ENFORCE(lstm_bias_var);
+      const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
+      bias_tensor->Resize(lstm_bias_tensor.dims());
+
+      GET_NODE(fc_bias);
+      auto* fc_bias_var = scope->FindVar(fc_bias_n->Name());
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+
+      auto* data = bias_tensor->mutable_data<float>(platform::CPUPlace());
+
+      for (int i = 0; i < bias_tensor->numel(); i++) {
+        data[i] =
+            fc_bias_tensor.data<float>()[i] + lstm_bias_tensor.data<float>()[i];
+      }
+      op_desc.SetInput("Bias", {new_bias_var});
+    }
+#undef GET_NODE

-    VLOG(4) << "hidden_n: " << hidden_n->Name();
-    VLOG(4) << "cell: " << cell_n->Name();
-    VLOG(4) << "xx: " << xx_n->Name();
+    // Create temp variables.
+    scope->Var(name_scope + "/BatchedInput.new")
+        ->GetMutable<framework::LoDTensor>();
+    scope->Var(name_scope + "/BatchCellPreAct.new")
+        ->GetMutable<framework::LoDTensor>();
+    scope->Var(name_scope + "/BatchedGate.new")
+        ->GetMutable<framework::LoDTensor>();

    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
    op_desc.SetOutput("Hidden", {hidden_n->Name()});
    op_desc.SetOutput("Cell", {cell_n->Name()});
    op_desc.SetOutput("XX", {xx_n->Name()});
-    op_desc.SetOutput("BatchedGate", {"blstm_0.tmp_2"});
-    op_desc.SetOutput("BatchCellPreAct", {"blstm_1.tmp_2"});
+    op_desc.SetOutput("BatchedGate", {name_scope + "/BatchedGate.new"});
+    op_desc.SetOutput("BatchCellPreAct", {name_scope + "/BatchCellPreAct.new"});
+    op_desc.SetOutput("BatchedInput", {name_scope + "/BatchedInput.new"});
    op_desc.SetAttr("is_reverse", lstm_n->Op()->GetAttr("is_reverse"));
-    op_desc.SetAttr("use_peepholes", false);
-    auto* op = graph->CreateOpNode(&op_desc);
+    op_desc.SetAttr("use_peepholes", lstm_n->Op()->GetAttr("use_peepholes"));
+    // TODO(TJ): get from attr
+    op_desc.SetAttr("use_seq", true);
+
+#define TMP_NAME(x) "at.new.tmp." #x
+#define OP_SET_OUT(x) op_desc.SetOutput(#x, {TMP_NAME(x)})
+    OP_SET_OUT(BatchedCell);
+    OP_SET_OUT(BatchedHidden);
+    OP_SET_OUT(ReorderedH0);
+    OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT

-#define LINK_TO(a, b)      \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    LINK_TO(input_n, op);
-    LINK_TO(weight_x_n, op);
-    LINK_TO(weight_h_n, op);
-    LINK_TO(bias_n, op);
-    LINK_TO(op, hidden_n);
-#undef LINK_TO
+    auto* op = graph->CreateOpNode(&op_desc);
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+
+#define TMP_NEW(x) scope->Var(TMP_NAME(x))->GetMutable<LoDTensor>()
+    TMP_NEW(BatchedCell);
+    TMP_NEW(BatchedHidden);
+    TMP_NEW(ReorderedH0);
+    TMP_NEW(ReorderedC0);
+#undef TMP_NEW
+#undef TMP_NAME
+
+    IR_NODE_LINK_TO(input_n, op);
+    IR_NODE_LINK_TO(weight_x_n, op);
+    IR_NODE_LINK_TO(weight_h_n, op);
+    IR_NODE_LINK_TO(bias_n, op);
+    IR_NODE_LINK_TO(op, hidden_n);
    return op;
+  };

+  int fusion_count{0};
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+#define GET_NODE(name__)                                \
+  std::string name__##key = name_scope + "/" + #name__; \
+  auto* name__##n = pattern->RetrieveNode(name__##key); \
+  PADDLE_ENFORCE(name__##n);                            \
+  PADDLE_ENFORCE(subgraph.count(name__##n));            \
+  Node* name__##_n = subgraph.at(name__##n);            \
+  int name__ __attribute__((unused)) = name__##_n->id();
+
+    GET_NODE(x);
+    GET_NODE(w);
+    GET_NODE(mul);
+    GET_NODE(fc_out);
+    GET_NODE(Weight);
+    GET_NODE(lstm);
+    GET_NODE(Bias);
+    GET_NODE(Hidden);
+    GET_NODE(Cell);
+
+    if (with_fc_bias) {
+      GET_NODE(fc_bias);
+      GET_NODE(elementwise_add);
+      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, fc_bias);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes(
+          {mul_n, lstm_n, elementwise_add_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      lstm_creator(lstm, x, w, Weight, Bias, Hidden, Cell, fc_out, -1);
+      // Remove unneeded nodes.
+      std::unordered_set<const Node*> marked_nodes({mul_n, lstm_n});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+#undef GET_NODE
+
+    ++fusion_count;
  };

-  lstm_creator(16, 12, 14, 18, 17, 22, 21, 19);
-  lstm_creator(26, 12, 24, 28, 27, 32, 31, 29);
+  gpd(graph, handler);

-  // remove all the nodes
+  return fusion_count;
+}

-  for (auto* node : marked_nodes) {
-    graph->RemoveNode(const_cast<Node*>(node));
-  }
+std::unique_ptr<ir::Graph> MulLstmFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());

-  for (auto* node : graph->Nodes()) {
-    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
-      if (marked_nodes.count(*it)) {
-        it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
-        it++;
-    }
-    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
-      if (marked_nodes.count(*it)) {
-        it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
-        it++;
-    }
-  }
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 false /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);

+  AddStatis(fusion_count);
  return graph;
 }

@@ -123,4 +209,5 @@ std::unique_ptr<ir::Graph> FCLstmFusePass::ApplyImpl(
 }  // namespace framework
 }  // namespace paddle

+REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
 REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h
@@ -12,20 +12,36 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
-#include "paddle/fluid/framework/ir/pass.h"

 namespace paddle {
 namespace framework {
 namespace ir {

-class FCLstmFusePass : public Pass {
+// MulLstmFusePass and FCLstmFusePass both fuse to the same FusionLstm op.
+
+// FC with bias (mul + elementwise_add) followed by LSTM.
+class FCLstmFusePass : public FusePassBase {
 public:
  virtual ~FCLstmFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"fc_lstm_fuse"};
+};
+
+// FC without bias (mul only) followed by LSTM; its ApplyImpl runs the fusion
+// with with_fc_bias set to false.
+class MulLstmFusePass : public FusePassBase {
+ public:
+  virtual ~MulLstmFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+  // Name scope passed to FusePassBase::Init and BuildFusion.
+  const std::string name_scope_{"fc_nobias_lstm_fuse"};
 };

 }  // namespace ir

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -167,7 +167,6 @@ class Graph {
  std::map<std::string, std::function<void(void)>> attr_dels_;
  std::map<ir::Node *, std::unique_ptr<ir::Node>> nodes_;
  std::unordered_set<ir::Node *> node_set_;
-  int node_count_{0};
 };

 bool IsControlDepVar(const ir::Node &var);

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/platform/enforce.h"

 namespace paddle {
@@ -71,7 +72,10 @@ void PDPattern::AddEdge(PDNode* a, PDNode* b) {

 void GraphPatternDetector::operator()(Graph* graph,
                                      GraphPatternDetector::handle_t handler) {
-  if (!MarkPDNodesInGraph(*graph)) return;
+  if (!MarkPDNodesInGraph(*graph)) {
+    return;
+  }
+
  auto subgraphs = DetectPatterns();
  UniquePatterns(&subgraphs);
  RemoveOverlappedMatch(&subgraphs);
@@ -81,13 +85,13 @@ void GraphPatternDetector::operator()(Graph* graph,
  LOG(INFO) << "detect " << subgraphs.size() << " subgraph matches the pattern";
  int id = 0;
  for (auto& g : subgraphs) {
-    LOG(INFO) << "optimizing #" << id++ << " subgraph";
+    VLOG(3) << "optimizing #" << id++ << " subgraph";
    handler(g, graph);
  }
 }

 bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
-  VLOG(4) << "mark pdnodes in graph";
+  VLOG(3) << "mark pdnodes in graph";
  if (graph.Nodes().empty()) return false;

  for (auto& node : GraphTraits::DFS(graph)) {
@@ -106,7 +110,13 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
      return false;
    }
  }
+  for (auto& item : pdnodes2nodes_) {
+    for (auto& n : item.second) {
+      GetMarkedNodes(const_cast<Graph*>(&graph)).insert(n);
+    }
+  }
  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+
  return !pdnodes2nodes_.empty();
 }

@@ -272,7 +282,7 @@ void GraphPatternDetector::RemoveOverlappedMatch(
  for (const auto& subgraph : *subgraphs) {
    bool valid = true;
    for (auto& item : subgraph) {
-      if (node_set.count(item.second)) {
+      if (item.first->IsIntermediate() && node_set.count(item.second)) {
        valid = false;
        break;
      }
@@ -328,22 +338,22 @@ PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) {
 }

 PDNode* PDNode::assert_is_op() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsOp(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsOp(); });
  return this;
 }
 PDNode* PDNode::assert_is_op(const std::string& op_type) {
-  asserts_.emplace_back([this, op_type](Node* x) {
+  asserts_.emplace_back([op_type](Node* x) {
    return x && x->IsOp() && x->Op()->Type() == op_type;
  });
  return this;
 }
 PDNode* PDNode::assert_is_var() {
-  asserts_.emplace_back([this](Node* x) { return x && x->IsVar(); });
+  asserts_.emplace_back([](Node* x) { return x && x->IsVar(); });
  return this;
 }
 PDNode* PDNode::assert_var_not_persistable() {
  assert_is_var();
-  asserts_.emplace_back([this](Node* x) { return !x->Var()->Persistable(); });
+  asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); });
  return this;
 }
 PDNode* PDNode::assert_is_persistable_var() {
@@ -357,7 +367,9 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
  assert_is_op_input(op_type);
  asserts_.emplace_back([=](Node* x) {
    for (auto* op : x->outputs) {
-      if (IsNthInput(x, op, argument, nth)) return true;
+      if (op->IsOp() && op->Op()->Type() == op_type &&
+          IsNthInput(x, op, argument, nth))
+        return true;
    }
    return false;
  });
@@ -368,7 +380,9 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
  assert_is_var();
  asserts_.emplace_back([=](Node* x) {
    for (auto* op : x->inputs) {
-      if (IsNthOutput(x, op, argument, nth)) return true;
+      if (op->IsOp() && op->Op()->Type() == op_type &&
+          IsNthOutput(x, op, argument, nth))
+        return true;
    }
    return false;
  });
@@ -412,6 +426,12 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
  });
  return this;
 }
+PDNode* PDNode::assert_is_op_output(const std::string& op_type,
+                                    const std::string& argument) {
+  // Checks slot 0 of `argument`; the nth-output assert already adds IsVar.
+  assert_is_op_nth_output(op_type, argument, 0);
+  return this;
+}
 PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
  assert_is_var();
  asserts_.emplace_back([=](Node* x) {
@@ -424,6 +444,12 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
  });
  return this;
 }
+PDNode* PDNode::assert_is_op_input(const std::string& op_type,
+                                   const std::string& argument) {
+  // Checks slot 0 of `argument`; the nth-input assert already adds IsVar.
+  assert_is_op_nth_input(op_type, argument, 0);
+  return this;
+}
 PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
  assert_is_op(op_type);
  asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; });
@@ -439,6 +465,130 @@ PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
  return this;
 }

+bool VarLinksToOp(Node* node, const std::string& op_type) {
+  // True as soon as one consumer of `node` is an op of type `op_type`.
+  for (auto* consumer : node->outputs) {
+    if (!consumer->IsOp()) continue;
+    if (consumer->Op()->Type() == op_type) return true;
+  }
+  return false;
+}
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
+  PADDLE_ENFORCE(var->IsVar());
+  PADDLE_ENFORCE(op->IsOp());
+  const auto& slot = op->Op()->Input(argument);  // var names bound to the slot
+  return nth < slot.size() && var->Name() == slot[nth];
+}
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
+  PADDLE_ENFORCE(var->IsVar());
+  PADDLE_ENFORCE(op->IsOp());
+  const auto& slot = op->Op()->Output(argument);  // var names bound to the slot
+  return nth < slot.size() && var->Name() == slot[nth];
+}
+void GraphSafeRemoveNodes(Graph* graph,
+                          const std::unordered_set<const Node*>& nodes) {
+  // Step 1: remove the requested nodes from the graph.
+  for (auto* doomed : nodes) {
+    graph->RemoveNode(const_cast<Node*>(doomed));
+  }
+
+  // Strips every entry of `edges` that refers to one of the removed nodes.
+  // Only the pointer values are compared; the removed nodes are never
+  // dereferenced here.
+  auto prune = [&nodes](std::vector<Node*>* edges) {
+    for (auto it = edges->begin(); it != edges->end();) {
+      it = nodes.count(*it) ? edges->erase(it) : it + 1;
+    }
+  };
+
+  // Step 2: scrub both edge lists of every node that is still in the graph,
+  // inputs first and then outputs.
+  for (auto* survivor : graph->Nodes()) {
+    prune(&const_cast<Node*>(survivor)->inputs);
+    prune(&const_cast<Node*>(survivor)->outputs);
+  }
+}
+bool VarLinksFromOp(Node* node, const std::string& op_type) {
+  // True as soon as one producer of `node` is an op of type `op_type`.
+  for (auto* producer : node->inputs) {
+    if (!producer->IsOp()) continue;
+    if (producer->Op()->Type() == op_type) return true;
+  }
+  return false;
+}
+
+PDNode* patterns::FC(PDPattern* pattern, const std::string& name_scope,
+                     PDNode* x, bool with_bias) {
+  // Operator nodes: mul always, elementwise_add only when with_bias is set.
+  PDNode* elementwise_add_op{nullptr};
+  auto* mul_op = pattern->NewNode(name_scope, "mul")->assert_is_op("mul");
+  if (with_bias) {
+    elementwise_add_op = pattern->NewNode(name_scope, "elementwise_add")
+                             ->assert_is_op("elementwise_add");
+  }
+  // Variable nodes (x itself is supplied by the caller and linked in below).
+  // w: persistable var that is entry 0 of some mul op's "Y" input.
+  auto* mul_weight_var = pattern->NewNode(name_scope, "w")
+                             ->AsInput()
+                             ->assert_is_persistable_var()
+                             ->assert_is_op_nth_input("mul", "Y", 0);
+  PDNode* mul_out_var{nullptr};
+  if (with_bias) {
+    // mul_out: intermediate variable, will be removed in the IR after fuse.
+    mul_out_var = pattern->NewNode(name_scope, "mul_out")
+                      ->AsIntermediate()
+                      ->assert_is_only_output_of_op("mul")
+                      ->assert_is_op_input("elementwise_add");
+  }
+  PDNode *bias{nullptr}, *fc_out{nullptr};
+  if (with_bias) {
+    // fc_bias: pattern input that feeds an elementwise_add op.
+    bias = pattern->NewNode(name_scope, "fc_bias")
+               ->assert_is_op_input("elementwise_add")
+               ->AsInput();
+    // fc_out: pattern output produced by an elementwise_add op.
+    fc_out = pattern->NewNode(name_scope, "fc_out")
+                 ->AsOutput()
+                 ->assert_is_op_output("elementwise_add");
+  } else {
+    fc_out = pattern->NewNode(name_scope, "fc_out")
+                 ->AsOutput()
+                 ->assert_is_op_output("mul");
+  }
+
+  if (with_bias) {
+    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({mul_out_var});
+    elementwise_add_op->LinksFrom({mul_out_var, bias}).LinksTo({fc_out});
+  } else {
+    mul_op->LinksFrom({mul_weight_var, x}).LinksTo({fc_out});
+  }
+
+  return fc_out;
+}
+PDNode* patterns::LSTM(PDPattern* pattern, const std::string& name_scope,
+                       PDNode* x) {
+  x->assert_is_op_input("lstm", "Input");
+  auto* lstm_op = pattern->NewNode(name_scope, "lstm")->assert_is_op("lstm");
+#define NEW_NODE(arg__, io__)                        \
+  auto* arg__ = pattern->NewNode(name_scope, #arg__) \
+                    ->assert_is_op_##io__("lstm", #arg__);
+  // NEW_NODE(X, io) declares `X`, asserted to be the lstm op's io slot "X".
+  // Currently, the H0 and C0 inputs are optional and therefore not matched.
+  // TODO(Superjomn) upgrade the fuse framework to support optional.
+  // NEW_NODE(H0, input);
+  // NEW_NODE(C0, input);
+  NEW_NODE(Weight, input);
+  NEW_NODE(Bias, input);
+
+  NEW_NODE(Hidden, output);
+  NEW_NODE(Cell, output);
+  NEW_NODE(BatchGate, output);
+  NEW_NODE(BatchCellPreAct, output);
+#undef NEW_NODE  // keep the helper macro from leaking past this function
+  lstm_op->LinksFrom({x, Weight, Bias});
+  lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct});
+  return Hidden;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -19,6 +19,9 @@
 #endif

 #include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/inference/analysis/dot.h"
@@ -95,7 +98,11 @@ struct PDNode {
  PDNode* assert_var_not_persistable();
  PDNode* assert_is_persistable_var();
  PDNode* assert_is_op_output(const std::string& op_type);
+  PDNode* assert_is_op_output(const std::string& op_type,
+                              const std::string& argument);
  PDNode* assert_is_op_input(const std::string& op_type);
+  PDNode* assert_is_op_input(const std::string& op_type,
+                             const std::string& argument);
  PDNode* assert_is_op_nth_input(const std::string& op_type,
                                 const std::string& argument, int nth);
  PDNode* assert_is_op_nth_output(const std::string& op_type,
@@ -167,6 +174,9 @@ class PDPattern {

  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID());
  PDNode* NewNode(const std::string& name = NewID());
+  PDNode* NewNode(const std::string& prefix, const std::string& name) {
+    return NewNode(prefix + "/" + name);  // node id becomes "<prefix>/<name>"
+  }
  PDNode* RetrieveNode(const std::string& id) const;

  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
@@ -238,6 +248,8 @@ class GraphPatternDetector {
  void UniquePatterns(std::vector<subgraph_t>* subgraphs);

  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
+  // Intermediate PDNodes will be removed, so they can't be shared by multiple
+  // patterns.
  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);

  // Validate whether the intermediate nodes are linked by external nodes.
@@ -257,64 +269,40 @@ class GraphPatternDetector {

 // some helper methods.

-// Op's input.
-static bool VarLinksToOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->outputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
+// Tell whether a var node is an input of some op of type `op_type`.
+bool VarLinksToOp(Node* node, const std::string& op_type);

-// Op's output.
-static bool VarLinksFromOp(Node* node, const std::string& op_type) {
-  for (auto* out : node->inputs) {
-    if (out->IsOp() && out->Op()->Type() == op_type) {
-      return true;
-    }
-  }
-  return false;
-}
+// Tell whether a var node is an output of some op of type `op_type`.
+bool VarLinksFromOp(Node* node, const std::string& op_type);

 // Check whether a var node is a op node's nth input.
-static bool IsNthInput(Node* var, Node* op, const std::string& argument,
-                       size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (op->inputs.size() <= nth) return false;
-  return var->Name() == op->Op()->Input(argument)[nth];
-}
-
-static bool IsNthOutput(Node* var, Node* op, const std::string& argument,
-                        size_t nth) {
-  PADDLE_ENFORCE(var->IsVar());
-  PADDLE_ENFORCE(op->IsOp());
-  if (op->inputs.size() <= nth) return false;
-  return var->Name() == op->Op()->Output(argument)[nth];
-}
-
-static void GraphSafeRemoveNodes(Graph* graph,
-                                 const std::unordered_set<const Node*>& nodes) {
-  for (auto* node : nodes) {
-    graph->RemoveNode(const_cast<Node*>(node));
-  }
+bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth);

-  for (auto* node : graph->Nodes()) {
-    for (auto it = node->inputs.begin(); it != node->inputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node*>(node)->inputs.erase(it);
-      } else
-        it++;
-    }
-    for (auto it = node->outputs.begin(); it != node->outputs.end();) {
-      if (nodes.count(*it)) {
-        it = const_cast<Node*>(node)->outputs.erase(it);
-      } else
-        it++;
-    }
-  }
-}
+// Tell whether a var node is an op node's nth output.
+bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth);
+
+// Safely remove nodes from the graph and clean up the edges that touch them.
+void GraphSafeRemoveNodes(Graph* graph,
+                          const std::unordered_set<const Node*>& nodes);
+
+// Some pre-defined patterns that can be reused in multiple passes.
+namespace patterns {
+
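+// FC, optionally followed by a bias add (controlled by `with_bias`).
+// op: mul (+ elementwise_add when with_bias is true)
+// named nodes ([...] exist only when with_bias is true):
+// mul, [elementwise_add]
+// w, [mul_out], [fc_bias], fc_out (returned)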
+PDNode* FC(PDPattern* pattern, const std::string& name_scope, PDNode* x,
+           bool with_bias);
+
+PDNode* LSTM(PDPattern* pattern, const std::string& name_scope, PDNode* x);
+
+}  // namespace patterns
+
+#define IR_NODE_LINK_TO(a, b)                                    \
+  /* One expression, so it stays correct in an unbraced if. */   \
+  ((a)->outputs.push_back(b), (b)->inputs.push_back(a))

 }  // namespace ir
 }  // namespace framework

--- a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc
@@ -140,8 +140,9 @@ TEST(GraphPatternDetecter, MultiSubgraph) {
        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
      },
      "OP0");
-  auto* any_var = x.mutable_pattern()->NewNode(
-      [](Node* node) { return node->IsVar(); }, "VAR");
+  auto* any_var = x.mutable_pattern()
+                      ->NewNode([](Node* node) { return node->IsVar(); }, "VAR")
+                      ->AsIntermediate();
  auto* any_op1 = x.mutable_pattern()->NewNode(
      [](Node* node) { return node->IsOp(); }, "OP1");


--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -50,20 +50,37 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(

  Dot dot;

-  std::vector<Dot::Attr> op_attrs({Dot::Attr("style", "filled"),
-                                   Dot::Attr("shape", "box"),
-                                   Dot::Attr("fillcolor", "red")});
-  std::vector<Dot::Attr> var_attrs({Dot::Attr("style", "filled,rounded"),
-                                    // Dot::Attr("shape", "diamond"),
+  const std::vector<Dot::Attr> op_attrs({
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("color", "#303A3A"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+      Dot::Attr("width", "1.3"),                  //
+      Dot::Attr("height", "0.84"),                //
+      Dot::Attr("fontname", "Arial"),             //
+  });
+  const std::vector<Dot::Attr> arg_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("fillcolor", "#999999"),          //
+      Dot::Attr("color", "#dddddd"),              //
+  });
+
+  const std::vector<Dot::Attr> param_attrs({
+      Dot::Attr("shape", "box"),                  //
+      Dot::Attr("style", "rounded,filled,bold"),  //
+      Dot::Attr("fontname", "Arial"),             //
+      Dot::Attr("color", "#148b97"),              //
+      Dot::Attr("fontcolor", "#ffffff"),          //
+  });
+
+  const std::vector<Dot::Attr> marked_op_attrs(
+      {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"),
+       Dot::Attr("fillcolor", "yellow")});
+  const std::vector<Dot::Attr> marked_var_attrs(
+      {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"),
       Dot::Attr("fillcolor", "yellow")});
-
-  std::vector<Dot::Attr> marked_op_attrs({Dot::Attr("style", "filled"),
-                                          Dot::Attr("shape", "box"),
-                                          Dot::Attr("fillcolor", "lightgray")});
-  std::vector<Dot::Attr> marked_var_attrs(
-      {Dot::Attr("style", "filled,rounded"),
-       // Dot::Attr("shape", "diamond"),
-       Dot::Attr("fillcolor", "lightgray")});

  auto marked_nodes = ConsumeMarkedNodes(graph.get());
  // Create nodes
@@ -74,9 +91,17 @@ std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
          marked_nodes.count(n) ? marked_op_attrs : op_attrs;
      dot.AddNode(node_id, attr, node_id);
    } else if (n->IsVar()) {
-      decltype(op_attrs) attr =
-          marked_nodes.count(n) ? marked_var_attrs : var_attrs;
-      dot.AddNode(node_id, attr, node_id);
+      decltype(op_attrs)* attr;
+      if (marked_nodes.count(n)) {
+        attr = &marked_var_attrs;
+      } else if (const_cast<Node*>(n)->Var() &&
+                 const_cast<Node*>(n)->Var()->Persistable()) {
+        attr = &param_attrs;
+      } else {
+        attr = &arg_attrs;
+      }
+
+      dot.AddNode(node_id, *attr, node_id);
    }
    node2dot[n] = node_id;
  }

--- a/paddle/fluid/framework/ir/graph_viz_pass.h
+++ b/paddle/fluid/framework/ir/graph_viz_pass.h
@@ -42,6 +42,13 @@ class GraphVizPass : public Pass {
  marked_nodes_t ConsumeMarkedNodes(Graph* graph) const;
 };

+inline GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) {
+  if (!graph->Has(kGraphvizMarkedNodeAttr)) {  // create the set on first use
+    graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t);
+  }
+  return graph->Get<GraphVizPass::marked_nodes_t>(kGraphvizMarkedNodeAttr);
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
+++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc
@@ -13,42 +13,41 @@
 // limitations under the License.

 #include <algorithm>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

 namespace paddle {
 namespace framework {
 namespace ir {

-class InferCleanGraphPass : public Pass {
+class InferCleanGraphPass : public FusePassBase {
 public:
  virtual ~InferCleanGraphPass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const {
    PADDLE_ENFORCE(graph.get());
+    FusePassBase::Init("original_graph", graph.get());  // after the null check

    auto is_valid_node = [](Node* x) {
      return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
    };

-    std::unordered_set<Node*> invalid_nodes;
+    std::unordered_set<const Node*> invalid_nodes;
+    int valid_op = 0;
    for (auto* node : graph->Nodes()) {
      if (is_valid_node(node)) {
        invalid_nodes.insert(node);
+      } else if (node->IsOp()) {
+        // Count the remaining operators so their number can be reported.
+        ++valid_op;
      }
    }

-    // remove nodes from the graph.
-    for (auto* node : invalid_nodes) {
-      graph->RemoveNode(node);
-    }
+    GraphSafeRemoveNodes(graph.get(), invalid_nodes);

-    // clean edges.
-    for (auto* node : graph->Nodes()) {
-      CleanEdges(&node->inputs, invalid_nodes);
-      CleanEdges(&node->outputs, invalid_nodes);
-    }
+    AddStatis(valid_op);

    return graph;
  }

--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -219,16 +219,13 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
    op_desc.SetAttr("fc_activation", act->Op()->Type());

    auto* op_node = graph->CreateOpNode(&op_desc);
-// Add links
-#define NODE_LINKS(a, b)   \
-  a->outputs.push_back(b); \
-  b->inputs.push_back(a);
-    NODE_LINKS(fc_w, op_node);
-    NODE_LINKS(fc_bias, op_node);
-    NODE_LINKS(concat_in0, op_node);
-    NODE_LINKS(sequence_expand0_in, op_node);
-    NODE_LINKS(sequence_expand1_in, op_node);
-    NODE_LINKS(op_node, fc_out);
+    // Add links
+    IR_NODE_LINK_TO(fc_w, op_node);
+    IR_NODE_LINK_TO(fc_bias, op_node);
+    IR_NODE_LINK_TO(concat_in0, op_node);
+    IR_NODE_LINK_TO(sequence_expand0_in, op_node);
+    IR_NODE_LINK_TO(sequence_expand1_in, op_node);
+    IR_NODE_LINK_TO(op_node, fc_out);

    // Clean nodes.
    std::unordered_set<const Node*> marked_nodes;
@@ -241,7 +238,6 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(
    marked_nodes.erase(sequence_expand0_in);
    marked_nodes.erase(sequence_expand1_in);
    marked_nodes.erase(fc_out);
-
    GraphSafeRemoveNodes(graph, marked_nodes);
  });


--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -10,19 +10,19 @@ set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor)
 # TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
    SRCS io.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} graph_to_program_pass)
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) 

 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)

 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)

-if(NOT APPLE)
+#if(APPLE)
  add_subdirectory(api)
-endif()
+#endif()

 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
 if(NOT APPLE)
  # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -32,6 +32,7 @@ endif()
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
    DEPS ${fluid_modules} paddle_fluid_api)

 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)

--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -25,17 +25,16 @@ function (inference_analysis_test TARGET)
    if(WITH_TESTING)
        set(options "")
        set(oneValueArgs "")
-        set(multiValueArgs SRCS EXTRA_DEPS)
+        set(multiValueArgs SRCS ARGS EXTRA_DEPS)
        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
        set(mem_opt "")
        if(WITH_GPU)
            set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
        endif()
        cc_test(${TARGET}
                SRCS "${analysis_test_SRCS}"
-                DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detector pass ${analysis_test_EXTRA_DEPS}
-                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
+                DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS})
        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
    endif(WITH_TESTING)
 endfunction(inference_analysis_test)
@@ -51,32 +50,19 @@ endfunction(inference_download_and_uncompress)
 set(DITU_RNN_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fmodel.tar.gz")
 set(DITU_RNN_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/ditu_rnn_fluid%2Fdata.txt.tar.gz")
 set(DITU_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/ditu_rnn" CACHE PATH "Ditu RNN model and data root." FORCE)
-if (NOT EXISTS ${DITU_INSTALL_DIR})
+if (NOT EXISTS ${DITU_INSTALL_DIR} AND WITH_TESTING)
  inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_MODEL_URL} "ditu_rnn_fluid%2Fmodel.tar.gz")
  inference_download_and_uncompress(${DITU_INSTALL_DIR} ${DITU_RNN_DATA_URL} "ditu_rnn_fluid%2Fdata.txt.tar.gz")
 endif()

 inference_analysis_test(test_analyzer SRCS analyzer_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis
-    analysis_predictor
-		# ir
-		fc_fuse_pass
-		fc_lstm_fuse_pass
-    seq_concat_fc_fuse_pass
-		graph_viz_pass
-		infer_clean_graph_pass
-		graph_pattern_detector
-    infer_clean_graph_pass
-    attention_lstm_fuse_pass
-    paddle_inference_api
-		pass
-    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
-        --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
+    ARGS --infer_ditu_rnn_model=${DITU_INSTALL_DIR}/model
         --infer_ditu_rnn_data=${DITU_INSTALL_DIR}/data.txt)

 inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
-inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc EXTRA_DEPS paddle_inference_api)
-inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc EXTRA_DEPS paddle_fluid)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc)
 inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
 inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
 inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
@@ -88,13 +74,37 @@ inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc)
 set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
 set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
 set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner" CACHE PATH "Chinese ner model and data root." FORCE)
-if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR})
+if (NOT EXISTS ${CHINESE_NER_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} "chinese_ner_model.tar.gz")
  inference_download_and_uncompress(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_DATA_URL} "chinese_ner-data.txt.tar.gz")
 endif()

-inference_analysis_test(test_chinese_ner SRCS chinese_ner_tester.cc
+inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
    EXTRA_DEPS paddle_inference_api paddle_fluid_api
-    ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model
-        --infer_model=${CHINESE_NER_INSTALL_DIR}/model
+    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+
+set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
+set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
+set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac" CACHE PATH "LAC model and data root." FORCE)
+if (NOT EXISTS "${LAC_INSTALL_DIR}" AND WITH_TESTING AND WITH_INFERENCE)
+  inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} "lac_model.tar.gz")
+  inference_download_and_uncompress(${LAC_INSTALL_DIR} ${LAC_DATA_URL} "lac_data.txt.tar.gz")
+endif()  # download only when the install dir does not exist yet
+
+inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api
+    ARGS --infer_model=${LAC_INSTALL_DIR}/model
+        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+
+
+set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification" CACHE PATH "Text Classification model and data root." FORCE)
+
+if (NOT EXISTS ${TEXT_CLASSIFICATION_INSTALL_DIR} AND WITH_TESTING AND WITH_INFERENCE)
+  inference_download_and_uncompress(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} "text-classification-Senta.tar.gz")
+endif()
+
+inference_analysis_test(test_text_classification SRCS test_text_classification.cc
+    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
+    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta)
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -14,6 +14,7 @@

 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include <string>
+#include <vector>
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -41,20 +42,16 @@ class DfgPassManagerImpl final : public DfgPassManager {
 public:
  DfgPassManagerImpl() {
    // TODO(Superjomn) set the key with pass reprs.
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
-    if (FLAGS_IA_enable_ir) {
-      AddPass("fluid-to-ir-pass", new FluidToIrPass);
-    } else {
+    if (!FLAGS_IA_enable_ir) {
      AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    } else {
+      AddPass("fluid-to-ir-pass", new FluidToIrPass);
    }
    TryAddTensorRtPass();
    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
    if (!FLAGS_IA_output_storage_path.empty()) {
      AddPass("model-store-pass", new ModelStorePass);
    }
-    LOG(INFO)
-        << "-----------------------------------------------------------------";
  }

  std::string repr() const override { return "dfg-pass-manager"; }
@@ -101,18 +98,15 @@ class DfgPassManagerImpl final : public DfgPassManager {
 Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }

 void Analyzer::Run(Argument* argument) {
-  // Ugly support fluid-to-ir-pass
-  argument->Set(kFluidToIrPassesAttr,
-                new std::vector<std::string>({
-                    // Manual update the passes here.
-                    "graph_viz_pass",                              //
-                    "infer_clean_graph_pass", "graph_viz_pass",    //
-                    "attention_lstm_fuse_pass", "graph_viz_pass",  //
-                    "fc_lstm_fuse_pass", "graph_viz_pass",         //
-                    "seq_concat_fc_fuse_pass", "graph_viz_pass",   //
-                    "fc_fuse_pass", "graph_viz_pass"               //
-
-                }));
+  std::vector<std::string> passes;
+  for (auto& pass : all_ir_passes_) {
+    if (!disabled_ir_passes_.count(pass)) {
+      passes.push_back(pass);
+      passes.push_back("graph_viz_pass");  // add graphviz for debug.
+    }
+  }
+  passes.push_back("graph_viz_pass");
+  argument->Set(kFluidToIrPassesAttr, new std::vector<std::string>(passes));

  for (auto& x : data_) {
    PADDLE_ENFORCE(x->Initialize(argument));
@@ -121,6 +115,11 @@ void Analyzer::Run(Argument* argument) {
  }
 }

+Analyzer& Analyzer::DisableIrPasses(const std::vector<std::string>& passes) {
+  for (const auto& pass_name : passes) disabled_ir_passes_.insert(pass_name);
+  return *this;  // returned by reference so calls can be chained
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -36,16 +36,10 @@ limitations under the License. */
 */

 #include <gflags/gflags.h>
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/pass.h"
 #include "paddle/fluid/inference/analysis/pass_manager.h"

-// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
-// flag if not available.
-DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
-DECLARE_string(IA_graphviz_log_root);
-DECLARE_string(IA_output_storage_path);
-DECLARE_bool(IA_enable_ir);
-
 namespace paddle {
 namespace inference {
 namespace analysis {
@@ -57,7 +51,26 @@ class Analyzer : public OrderedRegistry<PassManager> {

  void Run(Argument* argument);

+  Analyzer& DisableIrPasses(const std::vector<std::string>& passes);
+
  DISABLE_COPY_AND_ASSIGN(Analyzer);
+
+ private:
+  // All available IR passes.
+  // The bigger fuse comes first, so that the small operators prefer to be
+  // merged in a larger fuse op. The small fusion will not break the pattern of
+  // larger fusion.
+  const std::vector<std::string> all_ir_passes_{{
+      // Manual update the passes here.
+      "infer_clean_graph_pass",    //
+      "attention_lstm_fuse_pass",  //
+      "fc_lstm_fuse_pass",         //
+      "mul_lstm_fuse_pass",        //
+      "seq_concat_fc_fuse_pass",   //
+      "fc_fuse_pass",              //
+  }};
+
+  std::unordered_set<std::string> disabled_ir_passes_;
 };

 }  // namespace analysis

--- a/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_lac_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_string(infer_model, "", "model path for LAC");
+DEFINE_string(infer_data, "", "data file for LAC");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// Holds the LAC word-id dataset and serves it batch by batch.
+//
+// `datasets` keeps one id sequence per input line. `batched_datas` and
+// `batched_lods` hold the same sequences concatenated per batch together with
+// their cumulative-length offsets. `data` and `lod` are only filled on the
+// record returned by NextBatch().
+struct DataRecord {
+  std::vector<int64_t> data;
+  std::vector<size_t> lod;
+  // for dataset and nextbatch
+  size_t batch_iter{0};
+  std::vector<std::vector<size_t>> batched_lods;
+  std::vector<std::vector<int64_t>> batched_datas;
+  std::vector<std::vector<int64_t>> datasets;
+  DataRecord() = default;
+  // Loads `path` and groups its sentences into batches of `batch_size`.
+  explicit DataRecord(const std::string &path, int batch_size = 1) {
+    Load(path);
+    Prepare(batch_size);
+    batch_iter = 0;
+  }
+  // Reads one sentence per line into `datasets`, replacing earlier content.
+  // Each line is split on ';' and its second field is parsed as
+  // space-separated int64 word ids. Blank lines are skipped.
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    PADDLE_ENFORCE(file.is_open(), "Cannot open data file [%s]", path);
+    std::string line;
+    datasets.clear();
+    while (std::getline(file, line)) {
+      if (line.empty()) continue;
+      std::vector<std::string> fields;
+      split(line, ';', &fields);
+      // The ids live in the second field; fail loudly on a malformed line
+      // instead of indexing past the end of `fields`.
+      PADDLE_ENFORCE_GE(fields.size(), 2UL);
+      std::vector<int64_t> words_ids;
+      split_to_int64(fields[1], ' ', &words_ids);
+      datasets.emplace_back(words_ids);
+    }
+  }
+  // Rebuilds `batched_datas` / `batched_lods` from `datasets`.
+  // Every batch is the concatenation of up to `bs` sentences plus a lod vector
+  // of cumulative sentence lengths starting at 0. A trailing batch holding
+  // fewer than `bs` sentences is kept.
+  void Prepare(int bs) {
+    // Start from scratch so that a second call does not append duplicates.
+    batched_datas.clear();
+    batched_lods.clear();
+    if (bs == 1) {
+      batched_datas = datasets;
+      for (const auto &one_sentence : datasets) {
+        batched_lods.push_back({0, one_sentence.size()});
+      }
+    } else {
+      std::vector<int64_t> one_batch;
+      std::vector<size_t> lod{0};
+      int bs_id = 0;
+      for (const auto &one_sentence : datasets) {
+        bs_id++;
+        one_batch.insert(one_batch.end(), one_sentence.begin(),
+                         one_sentence.end());
+        lod.push_back(lod.back() + one_sentence.size());
+        if (bs_id == bs) {
+          bs_id = 0;
+          batched_datas.push_back(one_batch);
+          batched_lods.push_back(lod);
+          one_batch.clear();
+          // Restart the offsets at 0 for the next batch.
+          lod.clear();
+          lod.push_back(0);
+        }
+      }
+      if (!one_batch.empty()) {
+        batched_datas.push_back(one_batch);
+        batched_lods.push_back(lod);
+      }
+    }
+  }
+  // Returns the current batch in a fresh record (only `data` and `lod` are
+  // set) and advances the cursor, wrapping to the first batch after the last.
+  DataRecord NextBatch() {
+    PADDLE_ENFORCE(!batched_datas.empty(),
+                   "No batch is available, the dataset is empty.");
+    DataRecord data;
+    data.data = batched_datas[batch_iter];
+    data.lod = batched_lods[batch_iter];
+    batch_iter++;
+    if (batch_iter >= batched_datas.size()) {
+      batch_iter = 0;
+    }
+    return data;
+  }
+};
+// Fills `input_slots` with a single INT64 tensor named "word" built from the
+// next batch of `data`.
+//
+// The tensor has shape {number of ids, 1} and carries the batch lod. The batch
+// must hold between 1 and `batch_size` sentences: DataRecord::Prepare keeps a
+// shorter trailing batch when the dataset size is not a multiple of
+// `batch_size`, so an exact-size check would reject that last batch.
+void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                 int batch_size) {
+  auto one_batch = data->NextBatch();
+  PaddleTensor input_tensor;
+  input_tensor.name = "word";
+  input_tensor.shape.assign({static_cast<int>(one_batch.data.size()), 1});
+  input_tensor.lod.assign({one_batch.lod});
+  input_tensor.dtype = PaddleDType::INT64;
+  TensorAssignData<int64_t>(&input_tensor, {one_batch.data});
+  // The lod has one more entry than the batch has sentences.
+  const int num_sentences = static_cast<int>(one_batch.lod.size()) - 1;
+  PADDLE_ENFORCE_GT(num_sentences, 0);
+  PADDLE_ENFORCE_LE(num_sentences, batch_size);
+  input_slots->assign({input_tensor});
+}
+// Logs a three-line profile banner with `bs`, `repeat` and the average
+// latency, computed as `latency / repeat` and printed with an "ms" suffix.
+static void PrintTime(const double latency, const int bs, const int repeat) {
+  const double avg_latency = latency / repeat;
+  LOG(INFO) << "===========profile result===========";
+  LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
+            << ", avg latency: " << avg_latency << "ms";
+  LOG(INFO) << "=====================================";
+}
+// Times native-predictor inference over every batch of `data_file`.
+//
+// Runs FLAGS_burning untimed warm-up iterations on one batch, then makes
+// `repeat` passes over all batches, accumulating only the time spent inside
+// Run(). The accumulated total is handed to PrintTime.
+void BenchAllData(const std::string &model_path, const std::string &data_file,
+                  const int batch_size, const int repeat) {
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+
+  std::vector<PaddleTensor> input_slots;
+  std::vector<PaddleTensor> outputs_slots;
+  DataRecord data(data_file, batch_size);
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  // Warm up on a single batch before measuring anything.
+  GetOneBatch(&input_slots, &data, batch_size);
+  for (int warmup = 0; warmup < FLAGS_burning; ++warmup) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+
+  const size_t num_batches = data.batched_datas.size();
+  Timer timer;
+  double total_time = 0;
+  for (int round = 0; round < repeat; ++round) {
+    for (size_t bid = 0; bid < num_batches; ++bid) {
+      // Fetching the batch stays outside the timed region.
+      GetOneBatch(&input_slots, &data, batch_size);
+      timer.tic();
+      predictor->Run(input_slots, &outputs_slots);
+      total_time += timer.toc();
+    }
+  }
+  PrintTime(total_time, batch_size, repeat);
+}
+// Reference output ids for the LAC model. TestLACPrediction compares them
+// element by element with the leading entries of the predictor output.
+const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
+                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
+                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
+                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+// Runs the LAC model with the native predictor and checks its leading outputs.
+//
+// With `test_all_data` set, only benchmarks the whole dataset via BenchAllData.
+// Otherwise runs one batch FLAGS_burning times untimed and `repeat` times
+// timed, then compares the first output values with `lac_ref_data`.
+void TestLACPrediction(const std::string &model_path,
+                       const std::string &data_file, const int batch_size,
+                       const int repeat, bool test_all_data) {
+  if (test_all_data) {
+    BenchAllData(model_path, data_file, batch_size, repeat);
+    return;
+  }
+  NativeConfig config;
+  config.model_dir = model_path;
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  std::vector<PaddleTensor> input_slots, outputs_slots;
+  DataRecord data(data_file, batch_size);
+  GetOneBatch(&input_slots, &data, batch_size);
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  for (int i = 0; i < FLAGS_burning; i++) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < repeat; i++) {
+    predictor->Run(input_slots, &outputs_slots);
+  }
+  PrintTime(timer.toc(), batch_size, repeat);
+  // Fatal assertion: outputs_slots[0] is read right below.
+  ASSERT_EQ(outputs_slots.size(), 1UL);
+  auto &out = outputs_slots[0];
+  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                [](int a, int b) { return a * b; });
+  size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+  PADDLE_ENFORCE_GT(size, 0);
+  // Fatal assertion: the loop below reads `batch1_size` elements of the output,
+  // so a shorter output must stop the test instead of being read out of bounds.
+  ASSERT_GE(size, batch1_size);
+  int64_t *pdata = static_cast<int64_t *>(out.data.data());
+  for (size_t i = 0; i < batch1_size; ++i) {
+    EXPECT_EQ(pdata[i], lac_ref_data[i]);
+  }
+}
+// Entry point of the LAC test: forwards the command-line flags (model path,
+// data file, batch size, repeat count, test_all_data) to TestLACPrediction.
+TEST(Analyzer_LAC, native) {
+  LOG(INFO) << "LAC with native";
+  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
+                    FLAGS_repeat, FLAGS_test_all_data);
+}
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/analysis/chinese_ner_tester.cc
+++ b/paddle/fluid/inference/analysis/chinese_ner_tester.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -16,25 +16,27 @@

 #include <google/protobuf/text_format.h>
 #include <gtest/gtest.h>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"
-#include "paddle/fluid/platform/profiler.h"

 DEFINE_string(infer_ditu_rnn_model, "", "model path for ditu RNN");
 DEFINE_string(infer_ditu_rnn_data, "", "data path for ditu RNN");
 DEFINE_int32(batch_size, 10, "batch size.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");

 namespace paddle {
 namespace inference {
 namespace analysis {

-using namespace framework;
+using namespace framework;  // NOLINT

 TEST(Analyzer, analysis_without_tensorrt) {
  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
@@ -219,39 +221,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-std::string DescribeTensor(const PaddleTensor &tensor) {
-  std::stringstream os;
-  os << "Tensor [" << tensor.name << "]\n";
-  os << " - type: ";
-  switch (tensor.dtype) {
-    case PaddleDType::FLOAT32:
-      os << "float32";
-      break;
-    case PaddleDType::INT64:
-      os << "int64";
-      break;
-    default:
-      os << "unset";
-  }
-  os << '\n';
-
-  os << " - shape: " << to_string(tensor.shape) << '\n';
-  os << " - lod: ";
-  for (auto &l : tensor.lod) {
-    os << to_string(l) << "; ";
-  }
-  os << "\n";
-  os << " - data: ";
-
-  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
-                            [](int a, int b) { return a * b; });
-  for (int i = 0; i < dim; i++) {
-    os << static_cast<float *>(tensor.data.data())[i] << " ";
-  }
-  os << '\n';
-  return os.str();
-}
-
 }  // namespace

 const float ditu_rnn_target_data[] = {
@@ -265,57 +234,97 @@ const float ditu_rnn_target_data[] = {
    10.7286, 12.0595, 10.6672, 0,       0,       0,       0,       0,
    93.5771, 3.84641, 0,       0,       0,       0,       0,       0,
    169.426, 0,       0,       0,       0,       0,       0,       0};
+// Verifies that `outputs` matches `base_outputs` tensor by tensor.
+//
+// Enforces that both lists are non-empty and equally long and that paired
+// tensors have the same non-zero element count, then compares the buffers
+// element-wise as float with an absolute tolerance of 1e-3.
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &base_outputs) {
+  // Element count of a tensor: the product of its dimensions.
+  auto numel = [](const std::vector<int> &shape) -> size_t {
+    return std::accumulate(shape.begin(), shape.end(), 1,
+                           [](int a, int b) { return a * b; });
+  };
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
+  for (size_t idx = 0; idx < outputs.size(); ++idx) {
+    auto &out = outputs[idx];
+    auto &base_out = base_outputs[idx];
+    size_t out_size = numel(out.shape);
+    size_t base_size = numel(base_out.shape);
+    PADDLE_ENFORCE_EQ(out_size, base_size);
+    PADDLE_ENFORCE_GT(out_size, 0);
+    float *out_data = static_cast<float *>(out.data.data());
+    float *base_data = static_cast<float *>(base_out.data.data());
+    for (size_t j = 0; j < out_size; ++j) {
+      EXPECT_NEAR(out_data[j], base_data[j], 1e-3);
+    }
+  }
+}
 // Test with a really complicate model.
-void TestDituRNNPrediction(const std::string &model_path,
-                           const std::string &data_path, int batch_size,
-                           bool use_analysis, bool activate_ir,
-                           int num_times = 1) {
-  NativeConfig config;
+void TestDituRNNPrediction(bool use_analysis, bool activate_ir,
+                           int num_threads) {
+  AnalysisConfig config;
  config.prog_file = FLAGS_infer_ditu_rnn_model + "/__model__";
  config.param_file = FLAGS_infer_ditu_rnn_model + "/param";
  config.use_gpu = false;
  config.device = 0;
  config.specify_input_name = true;
+  config.enable_ir_optim = activate_ir;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+  config.ir_passes.clear();  // Do not exclude any pass.
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;

  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
  auto predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kAnalysis>(config);
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
  std::vector<PaddleTensor> input_slots;
-  DataRecord data(data_path, batch_size);
+  DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
  // Prepare inputs.
  PrepareInputs(&input_slots, &data, batch_size);
  std::vector<PaddleTensor> outputs, base_outputs;

  base_predictor->Run(input_slots, &base_outputs);

+  LOG(INFO) << "===========profile result===========";
+  if (num_threads == 1) {
+    // Prepare inputs.
    Timer timer;
    timer.tic();
    for (int i = 0; i < num_times; i++) {
      predictor->Run(input_slots, &outputs);
    }
-  LOG(INFO) << "===========profile result===========";
-  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << num_times
-            << ", latency: " << timer.toc() / num_times << "ms";
-  LOG(INFO) << "=====================================";
-
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    CompareResult(outputs, base_outputs);
+  } else {
+    std::vector<std::thread> threads;
+    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+    // because AttentionLSTM's hard-coded node ids will be damaged.
+    for (int tid = 0; tid < num_threads; ++tid) {
+      predictors.emplace_back(
+          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+              config));
    }
+    for (int tid = 0; tid < num_threads; ++tid) {
+      threads.emplace_back([&, tid]() {
+        // Each thread should have local input_slots and outputs.
+        std::vector<PaddleTensor> input_slots;
+        DataRecord data(FLAGS_infer_ditu_rnn_data, batch_size);
+        PrepareInputs(&input_slots, &data, batch_size);
+        std::vector<PaddleTensor> outputs;
+        Timer timer;
+        timer.tic();
+        for (int i = 0; i < num_times; i++) {
+          predictors[tid]->Run(input_slots, &outputs);
+        }
+        PrintTime(batch_size, num_times, num_threads, tid,
+                  timer.toc() / num_times);
+        CompareResult(outputs, base_outputs);
+      });
+    }
+    for (int i = 0; i < num_threads; ++i) {
+      threads[i].join();
    }
+  }
+  LOG(INFO) << "=====================================";

  if (use_analysis && activate_ir) {
    AnalysisPredictor *analysis_predictor =
@@ -327,39 +336,45 @@ void TestDituRNNPrediction(const std::string &model_path,
      LOG(INFO) << "fused " << item.first << " " << item.second;
    }

-    ASSERT_TRUE(fuse_statis.count("fc"));
-    EXPECT_EQ(fuse_statis.at("fc"), 1);
+    int num_ops = 0;
+    for (auto &node :
+         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+      if (node->IsFunction()) {
+        ++num_ops;
      }
-}
+    }
+    LOG(INFO) << "has num ops: " << num_ops;

-// Directly infer with the original model.
-TEST(Analyzer, DituRNN_without_analysis) {
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, false, false, FLAGS_repeat);
+    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+    EXPECT_EQ(num_ops,
+              13);  // After graph optimization, only 13 operators exists.
+  }
 }

-// Inference with the original model with the analysis turned on, the analysis
-// module will transform the program to a data flow graph.
-TEST(Analyzer, DituRNN_with_analysis) {
-  LOG(INFO) << "ditu rnn with analysis";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, false, FLAGS_repeat);
+// Inference with analysis and IR, easy for profiling independently.
+// The arguments are (use_analysis, activate_ir, num_threads); the thread count
+// comes from the --num_threads flag.
+TEST(Analyzer, DituRNN) {
+  TestDituRNNPrediction(true, true, FLAGS_num_threads);
+}

-// Inference with analysis and IR. The IR module will fuse some large kernels.
-TEST(Analyzer, DituRNN_with_analysis_with_IR) {
-  LOG(INFO) << "ditu rnn with analysis and IR fuse";
-  TestDituRNNPrediction(FLAGS_infer_ditu_rnn_model, FLAGS_infer_ditu_rnn_data,
-                        FLAGS_batch_size, true, true, FLAGS_repeat);
+// Remaining DituRNN unit tests: for 1 thread and for 4 threads, runs the
+// prediction with each (use_analysis, activate_ir) combination listed below.
+TEST(Analyzer, DituRNN_tests) {
+  const int thread_counts[2] = {1, 4};
+  for (size_t idx = 0; idx < 2; ++idx) {
+    const int num_threads = thread_counts[idx];
+    // Directly infer with the original model.
+    TestDituRNNPrediction(false, false, num_threads);
+    // Inference with the original model with the analysis turned on; the
+    // analysis module will transform the program to a data flow graph.
+    TestDituRNNPrediction(true, false, num_threads);
+    // Inference with analysis and IR. The IR module will fuse some large
+    // kernels.
+    TestDituRNNPrediction(true, true, num_threads);
+  }
+}

 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -67,7 +67,7 @@ struct Argument {
    PADDLE_ENFORCE(!attrs_.count(key), "Duplicate set Argument's attr [%s]",
                   key);
    attrs_[key] = data;
-    attr_deleters_[key] = [data, key, this]() {
+    attr_deleters_[key] = [data, key]() {
      VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
      VLOG(3) << "argument delete attr: " << key;
      delete data;

--- a/paddle/fluid/inference/analysis/flags.h
+++ b/paddle/fluid/inference/analysis/flags.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gflags/gflags.h>
+
+// Declarations of the gflags used by the inference analysis module, collected
+// in one header so that analyzer.h and fluid_to_ir_pass.h can both include it.
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(IA_enable_tensorrt_subgraph_engine);
+DECLARE_string(IA_graphviz_log_root);
+DECLARE_string(IA_output_storage_path);
+DECLARE_bool(IA_enable_ir);
--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h
@@ -15,6 +15,7 @@
 #pragma once

 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/flags.h"
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include "paddle/fluid/inference/analysis/pass.h"

@@ -85,9 +86,11 @@ class FluidToIrPass final : public DataFlowGraphPass {
          new Scope *(&argument_->Get<Scope>(ir::kParamScopeAttr)));
    }

+    if (FLAGS_IA_enable_ir) {
      const auto &ir_passes_to_apply =
          argument_->Get<std::vector<std::string>>(kFluidToIrPassesAttr);
      ir_passes.Apply(ir_passes_to_apply);
+    }

    PADDLE_ENFORCE(argument_->main_dfg.get());
    argument_->main_dfg->Build(ir_passes.graph());

--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc
@@ -16,6 +16,7 @@

 #include <gtest/gtest.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"

 namespace paddle {
 namespace inference {
@@ -33,10 +34,3 @@ TEST(FluidToIrPass, Test) {
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
-
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
-USE_PASS(attention_lstm_fuse_pass);
-USE_PASS(fc_lstm_fuse_pass);
-USE_PASS(seq_concat_fc_fuse_pass);
-USE_PASS(fc_fuse_pass);
--- a/paddle/fluid/inference/analysis/test_text_classification.cc
+++ b/paddle/fluid/inference/analysis/test_text_classification.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"
+
+DEFINE_string(infer_model, "", "Directory of the inference model.");
+DEFINE_string(infer_data, "", "Path of the dataset.");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(repeat, 1, "How many times to repeat run.");
+
+namespace paddle {
+
+// Renders `vec` as text: every element is streamed with operator<< and
+// followed by a single space, so a non-empty result ends with a space.
+template <typename T>
+std::string to_string(const std::vector<T> &vec) {
+  std::stringstream buffer;
+  for (auto it = vec.begin(); it != vec.end(); ++it) {
+    buffer << *it << " ";
+  }
+  return buffer.str();
+}
+
+// Logs a three-line profile banner with `bs`, `repeat` and the average
+// latency, computed as `latency / repeat` and printed with an "ms" suffix.
+void PrintTime(const double latency, const int bs, const int repeat) {
+  const double avg_latency = latency / repeat;
+  LOG(INFO) << "===========profile result===========";
+  LOG(INFO) << "batch_size: " << bs << ", repeat: " << repeat
+            << ", avg latency: " << avg_latency << "ms";
+  LOG(INFO) << "=====================================";
+}
+
+// Runs the text-classification model FLAGS_repeat times on a fixed three-word
+// input with the analysis predictor, then logs the latency and a short summary
+// of every output tensor.
+//
+// `batch_size` is only forwarded to PrintTime for reporting; the input built
+// below always holds a single sequence.
+void Main(int batch_size) {
+  // One input slot holding a single sequence of three word ids.
+  std::vector<PaddleTensor> input_slots(1);
+  // one batch starts
+  // data --
+  // NOTE(review): Reset presumably references `data0` without copying it --
+  // confirm. Either way `data0` outlives every Run() call below.
+  int64_t data0[] = {0, 1, 2};
+  for (auto &input : input_slots) {
+    input.data.Reset(data0, sizeof(data0));
+    input.shape = std::vector<int>({3, 1});
+    // dtype --
+    input.dtype = PaddleDType::INT64;
+    // LoD --
+    input.lod = std::vector<std::vector<size_t>>({{0, 3}});
+  }
+
+  // shape --
+  // Create Predictor --
+  AnalysisConfig config;
+  config.model_dir = FLAGS_infer_model;
+  config.use_gpu = false;
+  config.enable_ir_optim = true;
+  // NOTE(review): ir_mode is left at its default. If that default is kExclude,
+  // this entry disables fc_lstm_fuse_pass rather than enabling it -- confirm.
+  config.ir_passes.push_back("fc_lstm_fuse_pass");
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  inference::Timer timer;
+  double sum = 0;
+  std::vector<PaddleTensor> output_slots;
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    timer.tic();
+    CHECK(predictor->Run(input_slots, &output_slots));
+    sum += timer.toc();
+  }
+  PrintTime(sum, batch_size, FLAGS_repeat);
+
+  // Get output
+  LOG(INFO) << "get outputs " << output_slots.size();
+
+  for (auto &output : output_slots) {
+    LOG(INFO) << "output.shape: " << to_string(output.shape);
+    // no lod ?
+    CHECK_EQ(output.lod.size(), 0UL);
+    LOG(INFO) << "output.dtype: " << output.dtype;
+    // Print at most the first 5 values, and never more than the tensor holds:
+    // the element count is the product of the dimensions (0 for no shape).
+    size_t numel = output.shape.empty() ? 0 : 1;
+    for (int dim : output.shape) {
+      numel *= static_cast<size_t>(dim);
+    }
+    const size_t num_to_show = numel < 5 ? numel : 5;
+    std::stringstream ss;
+    // NOTE(review): the buffer is read as float; the dtype is logged above but
+    // not checked -- confirm the model output is FLOAT32.
+    for (size_t i = 0; i < num_to_show; i++) {
+      ss << static_cast<float *>(output.data.data())[i] << " ";
+    }
+    LOG(INFO) << "output.data summary: " << ss.str();
+    // one batch ends
+  }
+}
+
+// Single test case: runs Main with the batch size given by --batch_size.
+TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+
+}  // namespace paddle
+
+// IR passes referenced by this test binary.
+// NOTE(review): the other testers touched by this change drop their USE_PASS
+// lists and include paddle/fluid/inference/api/paddle_inference_pass.h
+// instead; presumably this file should do the same -- confirm.
+USE_PASS(fc_fuse_pass);
+USE_PASS(seq_concat_fc_fuse_pass);
+USE_PASS(fc_lstm_fuse_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(infer_clean_graph_pass);
+USE_PASS(attention_lstm_fuse_pass);
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -18,10 +18,7 @@ if(APPLE)
 endif(APPLE)


-set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager
-  graph_viz_pass fc_fuse_pass
-  infer_clean_graph_pass
-  )
+set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB})

 if(WITH_GPU AND TENSORRT_FOUND)
    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
@@ -47,7 +44,19 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)

 cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor)
-cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api)
+# AnalysisPredictor library. Besides the base inference API it depends on the
+# analysis module, the IR pass manager and the IR passes listed below.
+cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api
+          analysis
+          ir_pass_manager
+          pass
+          fc_fuse_pass
+          fc_lstm_fuse_pass
+          seq_concat_fc_fuse_pass
+          graph_viz_pass
+          infer_clean_graph_pass
+          graph_pattern_detector
+          attention_lstm_fuse_pass
+  )

 cc_test(test_paddle_inference_api
        SRCS api_tester.cc

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -14,10 +14,13 @@

 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
 #include "paddle/fluid/inference/utils/singleton.h"

 namespace paddle {
@@ -27,10 +30,11 @@ bool AnalysisPredictor::Init(
  VLOG(3) << "Predictor::init()";
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
+    LOG(WARNING) << "ir optimize only supports CPU currently";
+    config_.enable_ir_optim = false;
  } else {
    place_ = paddle::platform::CPUPlace();
  }
-  PADDLE_ENFORCE(!parent_scope);
  if (parent_scope) {
    scope_ = parent_scope;
    sub_scope_ = &(parent_scope->NewScope());
@@ -72,7 +76,7 @@ bool AnalysisPredictor::Init(

 void AnalysisPredictor::OptimizeInferenceProgram() {
  LOG(INFO) << "optimize begin";
-  FLAGS_IA_enable_ir = true;
+  FLAGS_IA_enable_ir = config_.enable_ir_optim;
  FLAGS_IA_enable_tensorrt_subgraph_engine = false;
  FLAGS_IA_output_storage_path = "";  // Don't output the model.
  // Analyze inference_program
@@ -89,24 +93,26 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }
  argument_.origin_program_desc.reset(
      new ProgramDesc(*inference_program_->Proto()));
-  Analyzer().Run(&argument_);
+  PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude,
+                 "Only kExclude is supported yet.");
+  Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_);
+
  CHECK(argument_.transformed_program_desc);
  VLOG(5) << "to prepare executor";
-  // LOG(INFO) << "transformed_parogram_desc " <<
-  // argument.transformed_program_desc->DebugString();
  inference_program_.reset(
      new framework::ProgramDesc(*argument_.transformed_program_desc));
-  PADDLE_ENFORCE(argument_.Has(framework::ir::kParamScopeAttr));
+  if (argument_.Has(framework::ir::kParamScopeAttr)) {
    // Update scope.
    scope_.reset(
        argument_.Release<framework::Scope>(framework::ir::kParamScopeAttr));
-  LOG(INFO) << "optimize end ==";
+  }
+  LOG(INFO) << "== optimize end ==";
 }

 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
-    NativeConfig, PaddleEngineKind::kAnalysis>(const NativeConfig& config) {
-  VLOG(3) << "create NativePredictor";
+    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) {
+  VLOG(3) << "create AnalysisConfig";
  if (config.use_gpu) {
    // 1. GPU memeroy
    PADDLE_ENFORCE_GT(
@@ -133,7 +139,3 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
 }

 }  // namespace paddle
-
-USE_PASS(fc_fuse_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(infer_clean_graph_pass);
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <string>
+#include <vector>
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
@@ -28,7 +30,7 @@ using framework::proto::ProgramDesc;
 */
 class AnalysisPredictor : public NativePaddlePredictor {
 public:
-  explicit AnalysisPredictor(const NativeConfig& config)
+  explicit AnalysisPredictor(const AnalysisConfig& config)
      : NativePaddlePredictor(config), config_(config) {}

  bool Init(const std::shared_ptr<framework::Scope>& parent_scope);
@@ -44,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor {
  Argument& analysis_argument() { return argument_; }

 private:
-  NativeConfig config_;
+  AnalysisConfig config_;
  Argument argument_;
 };


--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <sys/time.h>
 #include <algorithm>
 #include <map>
 #include <set>
@@ -23,32 +22,14 @@ limitations under the License. */

 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/inference/api/timer.h"
 #include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(profile, false, "Turn on profiler for fluid");

 namespace paddle {
 namespace {
-
-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
+using paddle::inference::Timer;

 template <class T>
 std::string num2str(T a) {
@@ -62,14 +43,14 @@ void NativePaddlePredictor::PrepareFeedFetch() {
  for (auto *op : inference_program_->Block(0).AllOps()) {
    if (op->Type() == "feed") {
      int idx = boost::get<int>(op->GetAttr("col"));
-      if (feeds_.size() <= (size_t)idx) {
+      if (feeds_.size() <= static_cast<size_t>(idx)) {
        feeds_.resize(idx + 1);
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
-      if (fetchs_.size() <= (size_t)idx) {
+      if (fetchs_.size() <= static_cast<size_t>(idx)) {
        fetchs_.resize(idx + 1);
      }
      fetchs_[idx] = op;
@@ -80,7 +61,7 @@ void NativePaddlePredictor::PrepareFeedFetch() {
 bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
-
+#if !defined(_WIN32)
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
    LOG(INFO) << "You can turn off by set gflags '-profile false'";
@@ -89,6 +70,7 @@ bool NativePaddlePredictor::Init(
                                           : platform::ProfilerState::kCPU;
    platform::EnableProfiler(tracking_device);
  }
+#endif

  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
@@ -133,10 +115,12 @@ bool NativePaddlePredictor::Init(
 }

 NativePaddlePredictor::~NativePaddlePredictor() {
+#if !defined(_WIN32)
  if (FLAGS_profile) {
    platform::DisableProfiler(platform::EventSortingKey::kTotal,
                              "./profile.log");
  }
+#endif
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }
@@ -179,15 +163,21 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
  }
+#ifdef __clang__
+  // fix clang compile error
+  return cls;
+#else
  // fix manylinux compile error.
  return std::move(cls);
+#endif
 }

 bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                    framework::Scope *scope) {
  VLOG(3) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
-    LOG(ERROR) << "wrong feed input size.";
+    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
+               << inputs.size();
    return false;
  }
  for (size_t i = 0; i < inputs.size(); ++i) {
@@ -329,7 +319,12 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
+#ifdef __clang__
+  // fix clang compile error
+  return predictor;
+#else
  return std::move(predictor);
+#endif
 }

 }  // namespace paddle
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -3,6 +3,11 @@ cmake_minimum_required(VERSION 3.0)
 project(cpp_inference_demo CXX C)

 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WIN32)  # prefix used for the glog/gflags/protobuf names in the Windows DEPS list below
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib")
+else()  # cleared elsewhere; the non-Windows DEPS list spells the library names directly
+set(CMAKE_STATIC_LIBRARY_PREFIX "")
+endif()

 if(NOT DEFINED PADDLE_LIB)
  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
@@ -32,44 +37,56 @@ endif(NOT WIN32)
 include_directories("${PADDLE_LIB}/third_party/boost")
 include_directories("${PADDLE_LIB}/third_party/eigen3")

+if (NOT WIN32)
 link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
 link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+endif(NOT WIN32)
+
 link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
 link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
 link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
-link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+link_directories("${PADDLE_LIB}/paddle/fluid/inference")

 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)

 if(WITH_MKL)
  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so 
-               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5.so)
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} 
+               ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn")
  if(EXISTS ${MKLDNN_PATH})
    include_directories("${MKLDNN_PATH}/include")
    set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
  endif()
 else()
-  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
 endif()

 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX})
 else()
  set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
+      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()
-set(EXTERNAL_LIB "-lrt -ldl -lpthread")

+if (NOT WIN32)
+set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
    ${MATH_LIB} ${MKLDNN_LIB}
    glog gflags protobuf snappystream snappy z
    ${EXTERNAL_LIB})
+else()
+set(DEPS ${DEPS}
+    ${MATH_LIB} ${MKLDNN_LIB}
+    ${CMAKE_STATIC_LIBRARY_PREFIX}glog  ${CMAKE_STATIC_LIBRARY_PREFIX}gflags  ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf
+    ${EXTERNAL_LIB})
+endif(NOT WIN32)
+
 if(WITH_GPU)
-  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart.so)
+  set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
 endif()

 target_link_libraries(${DEMO_NAME} ${DEPS})
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -14,7 +14,7 @@ else
 fi

 PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
+URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}

 # download vis_demo data
 function download() {

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -14,36 +14,19 @@

 #pragma once

+#include <glog/logging.h>
 #include <sys/time.h>
 #include <algorithm>
+#include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/timer.h"

 namespace paddle {
 namespace inference {

-// Timer for timer
-class Timer {
- public:
-  double start;
-  double startu;
-  void tic() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    start = tp.tv_sec;
-    startu = tp.tv_usec;
-  }
-  double toc() {
-    struct timeval tp;
-    gettimeofday(&tp, NULL);
-    double used_time_ms =
-        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
-    return used_time_ms;
-  }
-};
-
 static void split(const std::string &str, char sep,
                  std::vector<std::string> *pieces) {
  pieces->clear();
@@ -106,5 +89,58 @@ static void TensorAssignData(PaddleTensor *tensor,
  }
 }

+// Builds a multi-line, human-readable dump of `tensor`: its name, element
+// type, shape, LoD levels and data values.
+//
+// Declared `static`, like the other helpers in this header, so that every
+// translation unit including helper.h gets its own internal copy instead of
+// an external definition that collides at link time.
+//
+// NOTE(review): the data buffer is always read as float, including when dtype
+// is INT64 -- confirm whether int64 tensors need their own branch.
+static std::string DescribeTensor(const PaddleTensor &tensor) {
+  std::stringstream os;
+  os << "Tensor [" << tensor.name << "]\n";
+  os << " - type: ";
+  switch (tensor.dtype) {
+    case PaddleDType::FLOAT32:
+      os << "float32";
+      break;
+    case PaddleDType::INT64:
+      os << "int64";
+      break;
+    default:
+      os << "unset";
+  }
+  os << '\n';
+
+  os << " - shape: " << to_string(tensor.shape) << '\n';
+  os << " - lod: ";
+  for (auto &l : tensor.lod) {
+    os << to_string(l) << "; ";
+  }
+  os << "\n";
+  os << " - data: ";
+
+  // Element count is the product of all dimensions.
+  int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                            [](int a, int b) { return a * b; });
+  for (int i = 0; i < dim; i++) {
+    os << static_cast<float *>(tensor.data.data())[i] << " ";
+  }
+  os << '\n';
+  return os.str();
+}
+
+// Logs one benchmark record: batch size, repeat count, thread count, the
+// reporting thread's id, and the latency (printed with an "ms" suffix).
+// `static` for the same header-linkage reason as DescribeTensor.
+static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
+                      double latency) {
+  LOG(INFO) << "batch_size: " << batch_size << ", repeat: " << repeat
+            << ", threads: " << num_threads << ", thread id: " << tid
+            << ", latency: " << latency << "ms";
+}
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -150,6 +150,21 @@ struct TensorRTConfig : public NativeConfig {
  int workspace_size{1 << 30};
 };

+// NOTE: WIP, not stable yet. Config type taken by AnalysisPredictor's constructor.
+struct AnalysisConfig : public NativeConfig {
+  // Selects how the pass names in `ir_passes` are interpreted.
+  enum class IrPassMode {
+    kSystem,   // Use the system default passes; no customization.
+    kInclude,  // `ir_passes` specifies the passes to use.
+    kExclude   // `ir_passes` specifies the passes to disable.
+  };
+
+  bool enable_ir_optim = true;
+  IrPassMode ir_mode{IrPassMode::kExclude};
+  // attention_lstm_fuse_pass works only on specific models; disabled by default.
+  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+};
+
 // A factory to help create different predictors.
 //
 // FOR EXTENSION DEVELOPER:

--- a/paddle/fluid/inference/api/timer.h
+++ b/paddle/fluid/inference/api/timer.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <chrono>  // NOLINT
+
+namespace paddle {
+namespace inference {
+
+// Minimal stopwatch: tic() marks the start of an interval and toc() returns
+// the number of milliseconds elapsed since the most recent tic().
+class Timer {
+ public:
+  // Time point captured by the last tic().
+  std::chrono::high_resolution_clock::time_point start;
+  // Time point captured by the last toc(); despite its name this is the end
+  // of the measured interval.
+  std::chrono::high_resolution_clock::time_point startu;
+
+  void tic() { start = std::chrono::high_resolution_clock::now(); }
+
+  double toc() {
+    using Seconds = std::chrono::duration<double>;
+    startu = std::chrono::high_resolution_clock::now();
+    const Seconds elapsed = std::chrono::duration_cast<Seconds>(startu - start);
+    // The span is held in seconds; scale it to milliseconds for the caller.
+    return elapsed.count() * 1000.0;
+  }
+};
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/paddle_fluid.map
+++ b/paddle/fluid/inference/paddle_fluid.map
 {
 	global:
 		*paddle*;
+                *Pass*;
 	local:
 		*;
 };
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -178,6 +178,8 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(relu);\n")
      elseif(${TARGET} STREQUAL "fake_dequantize")
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      elseif(${TARGET} STREQUAL "fake_quantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n")
      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
      elseif(${TARGET} STREQUAL "fc")
@@ -293,6 +295,7 @@ op_library(extract_rows_op DEPS memory)
 op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
+op_library(fake_quantize_op DEPS memory)

 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)

--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -865,8 +865,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
    auto temp1 = static_cast<T>(1) /
                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
-    auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
-    dx.device(d) = dout * ((beta * out) + temp2);
+    auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
+    dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
  }
 };


--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/attention_lstm_op.h"
-#include <sys/time.h>
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"

--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/auc_op.h"
-#include <string>

 namespace paddle {
 namespace operators {
@@ -36,15 +35,12 @@ class AucOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(predict_height, label_height,
                      "Out and Label should have same height.");

-    int num_thres = ctx->Attrs().Get<int>("num_thresholds");
+    int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;

    ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("TPOut", {num_thres});
-    ctx->SetOutputDim("TNOut", {num_thres});
-    ctx->SetOutputDim("FPOut", {num_thres});
-    ctx->SetOutputDim("FNOut", {num_thres});
-
-    ctx->ShareLoD("Predict", /*->*/ "AUC");
+    ctx->SetOutputDim("BatchAUC", {1});
+    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
  }

 protected:
@@ -66,25 +62,24 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label",
             "A 2D int tensor indicating the label of the training data. "
             "shape: [batch_size, 1]");
-    AddInput("TP", "True-Positive value.");
-    AddInput("FP", "False-Positive value.");
-    AddInput("TN", "True-Negative value.");
-    AddInput("FN", "False-Negative value.");
    // TODO(typhoonzero): support weight input
+    AddInput("StatPos", "Statistic value when label = 1");
+    AddInput("StatNeg", "Statistic value when label = 0");
+
    AddOutput("AUC",
              "A scalar representing the "
              "current area-under-the-curve.");
-    AddOutput("TPOut", "True-Positive value.");
-    AddOutput("FPOut", "False-Positive value.");
-    AddOutput("TNOut", "True-Negative value.");
-    AddOutput("FNOut", "False-Negative value.");
+    AddOutput("BatchAUC", "The AUC for current batch");
+    AddOutput("StatPosOut", "Statistic value when label = 1");
+    AddOutput("StatNegOut", "Statistic value when label = 0");

    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
        .SetDefault("ROC");
+
    AddAttr<int>("num_thresholds",
                 "The number of thresholds to use when discretizing the"
                 " roc curve.")
-        .SetDefault(200);
+        .SetDefault((2 << 12) - 1);

    AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.

--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

 namespace paddle {
@@ -23,106 +23,85 @@ namespace operators {

 using Tensor = framework::Tensor;

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename DeviceContext, typename T>
 class AucKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* predict = ctx.Input<Tensor>("Predict");
-    auto* label = ctx.Input<Tensor>("Label");
-    auto* auc = ctx.Output<Tensor>("AUC");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *predict = ctx.Input<Tensor>("Predict");
+    auto *label = ctx.Input<Tensor>("Label");
+
+    std::string curve = ctx.Attr<std::string>("curve");
+    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    int num_pred_buckets = num_thresholds + 1;
+
    // Only use output var for now, make sure it's persistable and
    // not cleaned up for each batch.
-    auto* true_positive = ctx.Output<Tensor>("TPOut");
-    auto* false_positive = ctx.Output<Tensor>("FPOut");
-    auto* true_negative = ctx.Output<Tensor>("TNOut");
-    auto* false_negative = ctx.Output<Tensor>("FNOut");
+    auto *auc = ctx.Output<Tensor>("AUC");
+    auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
+    auto *stat_neg = ctx.Output<Tensor>("StatNegOut");

-    auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
+    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
+    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
+            auc);

-    std::string curve = ctx.Attr<std::string>("curve");
-    int num_thresholds = ctx.Attr<int>("num_thresholds");
-    std::vector<double> thresholds_list;
-    thresholds_list.reserve(num_thresholds);
-    for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
+    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
+    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
+    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
+            num_thresholds, batch_auc);
+  }
+
+ private:
+  inline static double trapezoidArea(double X1, double X2, double Y1,
+                                     double Y2) {  // |X1 - X2| * (Y1 + Y2) / 2
+    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
   }
-    const double kEpsilon = 1e-7;
-    thresholds_list[0] = 0.0f - kEpsilon;
-    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;

+  inline static void calcAuc(const framework::ExecutionContext &ctx,
+                             const framework::Tensor *label,
+                             const framework::Tensor *predict,
+                             int64_t *stat_pos, int64_t *stat_neg,
+                             int num_thresholds,
+                             framework::Tensor *auc_tensor) {
    size_t batch_size = predict->dims()[0];
    size_t inference_width = predict->dims()[1];
+    const T *inference_data = predict->data<T>();
+    const auto *label_data = label->data<int64_t>();

-    const T* inference_data = predict->data<T>();
-    const auto* label_data = label->data<int64_t>();
-
-    auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());
-    auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
+    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());

-    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
-      // calculate TP, FN, TN, FP for current thresh
-      int64_t tp = 0, fn = 0, tn = 0, fp = 0;
    for (size_t i = 0; i < batch_size; i++) {
-        // NOTE: label_data used as bool, labels > 0 will be treated as true.
+      uint32_t binIdx = static_cast<uint32_t>(
+          inference_data[i * inference_width + 1] * num_thresholds);
      if (label_data[i]) {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            tp++;
-          } else {
-            fn++;
-          }
-        } else {
-          if (inference_data[i * inference_width + 1] >=
-              (thresholds_list[idx_thresh])) {
-            fp++;
+        stat_pos[binIdx] += 1.0;
      } else {
-            tn++;
-          }
-        }
-      }
-      // store rates
-      tp_data[idx_thresh] += tp;
-      fn_data[idx_thresh] += fn;
-      tn_data[idx_thresh] += tn;
-      fp_data[idx_thresh] += fp;
-    }
-    // epsilon to avoid divide by zero.
-    double epsilon = 1e-6;
-    // Riemann sum to caculate auc.
-    Tensor tp_rate, fp_rate, rec_rate;
-    tp_rate.Resize({num_thresholds});
-    fp_rate.Resize({num_thresholds});
-    rec_rate.Resize({num_thresholds});
-    auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
-    auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
-    for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                        (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] =
-          static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
-                         (tp_data[i] + fp_data[i] + epsilon);
+        stat_neg[binIdx] += 1.0;
      }
-    *auc_data = 0.0f;
-    if (curve == "ROC") {
-      for (int i = 0; i < num_thresholds - 1; i++) {
-        auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
-        auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
    }
-    } else if (curve == "PR") {
-      for (int i = 1; i < num_thresholds; i++) {
-        auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
-        auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
-        *auc_data = *auc_data + dx * y;
+
+    *auc = 0.0f;
+
+    double totPos = 0.0;
+    double totNeg = 0.0;
+    double totPosPrev = 0.0;
+    double totNegPrev = 0.0;
+
+    int idx = num_thresholds;
+
+    while (idx >= 0) {
+      totPosPrev = totPos;
+      totNegPrev = totNeg;
+      totPos += stat_pos[idx];
+      totNeg += stat_neg[idx];
+      *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+
+      --idx;
    }
+
+    if (totPos > 0.0 && totNeg > 0.0) {
+      *auc = *auc / totPos / totNeg;
    }
  }
 };

--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * Computes bounding-box regression targets: for each of the first `box_num`
+ * rows, encodes the ground-truth box in `gt_boxes` relative to the example
+ * (proposal) box in `ex_boxes` and writes (dx, dy, dw, dh) into `box_delta`.
+ *
+ * - Columns 0/2 are used as the x extent and columns 1/3 as the y extent.
+ * - When `normalized` is false, 1 is added to every width and height.
+ * - `weights` may be null; otherwise component k of each delta is divided by
+ *   weights[k] (k = 0..3).
+ */
+template <typename T>
+inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
+                       const framework::Tensor& gt_boxes, const T* weights,
+                       const bool normalized, framework::Tensor* box_delta) {
+  auto ex = framework::EigenTensor<T, 2>::From(ex_boxes);
+  auto gt = framework::EigenTensor<T, 2>::From(gt_boxes);
+  auto delta = framework::EigenTensor<T, 2>::From(*box_delta);
+  for (int64_t row = 0; row < box_num; ++row) {
+    // Size and center of the example box.
+    const T ex_w = ex(row, 2) - ex(row, 0) + (normalized == false);
+    const T ex_h = ex(row, 3) - ex(row, 1) + (normalized == false);
+    const T ex_ctr_x = ex(row, 0) + 0.5 * ex_w;
+    const T ex_ctr_y = ex(row, 1) + 0.5 * ex_h;
+
+    // Size and center of the matching ground-truth box.
+    const T gt_w = gt(row, 2) - gt(row, 0) + (normalized == false);
+    const T gt_h = gt(row, 3) - gt(row, 1) + (normalized == false);
+    const T gt_ctr_x = gt(row, 0) + 0.5 * gt_w;
+    const T gt_ctr_y = gt(row, 1) + 0.5 * gt_h;
+
+    // Center offsets are scaled by the example size; sizes use a log ratio.
+    delta(row, 0) = (gt_ctr_x - ex_ctr_x) / ex_w;
+    delta(row, 1) = (gt_ctr_y - ex_ctr_y) / ex_h;
+    delta(row, 2) = std::log(gt_w / ex_w);
+    delta(row, 3) = std::log(gt_h / ex_h);
+
+    if (weights) {
+      for (int k = 0; k < 4; ++k) {
+        delta(row, k) /= weights[k];
+      }
+    }
+  }
+}
+
+// Copies `num` rows of `in_stride` elements each from `in` to `out`: output
+// row r receives input row index[r]. Rows are copied bytewise with memcpy.
+template <typename T>
+void Gather(const T* in, const int in_stride, const int* index, const int num,
+            T* out) {
+  const int row_bytes = in_stride * sizeof(T);
+  T* dst = out;
+  for (int row = 0; row < num; ++row, dst += in_stride) {
+    memcpy(dst, in + index[row] * in_stride, row_bytes);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -133,31 +134,6 @@ void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
  }
 }

-template <typename T>
-void BoxToDelta(int box_num, const Tensor& ex_boxes, const Tensor& gt_boxes,
-                const std::vector<float>& weights, Tensor* box_delta) {
-  auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
-  auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
-  auto box_delta_et = framework::EigenTensor<T, 2>::From(*box_delta);
-  T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y;
-  for (int64_t i = 0; i < box_num; ++i) {
-    ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + 1;
-    ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + 1;
-    ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w;
-    ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h;
-
-    gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + 1;
-    gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + 1;
-    gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w;
-    gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h;
-
-    box_delta_et(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0];
-    box_delta_et(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1];
-    box_delta_et(i, 2) = log(gt_w / ex_w) / ex_w / weights[2];
-    box_delta_et(i, 3) = log(gt_h / ex_h) / ex_h / weights[3];
-  }
-}
-
 template <typename T>
 std::vector<std::vector<int>> SampleFgBgGt(
    const platform::CPUDeviceContext& context, Tensor* iou,
@@ -243,12 +219,11 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
                       Tensor* sampled_labels, Tensor* sampled_gts) {
  int fg_num = fg_inds.size();
  int bg_num = bg_inds.size();
-  int gt_num = fg_num + bg_num;
  Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
  int* fg_inds_data = fg_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
  int* bg_inds_data = bg_inds_t.mutable_data<int>({bg_num}, context.GetPlace());
  int* gt_box_inds_data =
-      gt_box_inds_t.mutable_data<int>({gt_num}, context.GetPlace());
+      gt_box_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
  int* gt_label_inds_data =
      gt_label_inds_t.mutable_data<int>({fg_num}, context.GetPlace());
  std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data);
@@ -303,18 +278,20 @@ std::vector<Tensor> SampleRoisForOneImage(

  // Gather boxes and labels
  Tensor sampled_boxes, sampled_labels, sampled_gts;
-  int boxes_num = fg_inds.size() + bg_inds.size();
+  int fg_num = fg_inds.size();
+  int bg_num = bg_inds.size();
+  int boxes_num = fg_num + bg_num;
  framework::DDim bbox_dim({boxes_num, kBoxDim});
  sampled_boxes.mutable_data<T>(bbox_dim, context.GetPlace());
  sampled_labels.mutable_data<int>({boxes_num}, context.GetPlace());
-  sampled_gts.mutable_data<T>(bbox_dim, context.GetPlace());
+  sampled_gts.mutable_data<T>({fg_num, kBoxDim}, context.GetPlace());
  GatherBoxesLabels<T>(context, boxes, *gt_boxes, *gt_classes, fg_inds, bg_inds,
                       gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts);

  // Compute targets
  Tensor bbox_targets_single;
  bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
-  BoxToDelta<T>(boxes_num, sampled_boxes, sampled_gts, bbox_reg_weights,
+  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
                &bbox_targets_single);

  // Scale rois
@@ -427,7 +404,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
    auto rpn_rois_lod = rpn_rois->lod().back();
    auto gt_classes_lod = gt_classes->lod().back();
    auto gt_boxes_lod = gt_boxes->lod().back();
-    for (size_t i = 0; i < n; ++i) {
+    for (int i = 0; i < n; ++i) {
      Tensor rpn_rois_slice =
          rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
      Tensor gt_classes_slice =

--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -311,8 +311,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {

    rpn_rois->mutable_data<T>({bbox_deltas->numel() / 4, 4},
                              context.GetPlace());
-    rpn_roi_probs->mutable_data<T>({scores->numel() / 4, 1},
-                                   context.GetPlace());
+    rpn_roi_probs->mutable_data<T>({scores->numel(), 1}, context.GetPlace());

    Tensor bbox_deltas_swap, scores_swap;
    bbox_deltas_swap.mutable_data<T>({num, h_bbox, w_bbox, c_bbox},
@@ -421,7 +420,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
    if (nms_thresh <= 0) {
-      return std::make_pair(bbox_sel, scores_sel);
+      return std::make_pair(bbox_sel, scores_filter);
    }

    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);

--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */

 #include <random>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/math/math_function.h"

 namespace paddle {
@@ -46,156 +47,219 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
    auto in_dims = ctx->GetInputDim("DistMat");
    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
                      "The rank of Input(DistMat) must be 2.");
+
+    ctx->SetOutputDim("LocationIndex", {-1});
+    ctx->SetOutputDim("ScoreIndex", {-1});
+    ctx->SetOutputDim("TargetLabel", {-1, 1});
+    ctx->SetOutputDim("TargetBBox", {-1, 4});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("DistMat")->type()),
+        platform::CPUPlace());
  }
 };

 template <typename T>
 class RpnTargetAssignKernel : public framework::OpKernel<T> {
 public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
+    auto* gt_bbox_t = context.Input<Tensor>("GtBox");
+    auto* dist_t = context.Input<LoDTensor>("DistMat");
+
+    auto* loc_index_t = context.Output<Tensor>("LocationIndex");
+    auto* score_index_t = context.Output<Tensor>("ScoreIndex");
+    auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
+    auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
+
+    auto lod = dist_t->lod().back();
+    int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
+    int64_t anchor_num = dist_t->dims()[1];
+    PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
+
+    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
+    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
+    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
+    float fg_fraction = context.Attr<float>("fg_fraction");
+
+    int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
+
+    int64_t max_num = batch_num * anchor_num;
+    auto place = context.GetPlace();
+
+    tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
+    auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
+    auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
+
+    Tensor tmp_tgt_lbl;
+    auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
+    iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
+
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed =
+        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    engine.seed(seed);
+
+    int fg_num = 0;
+    int bg_num = 0;
+    for (int i = 0; i < batch_num; ++i) {
+      Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
+      Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
+      auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
+                                   rpn_batch_size, fg_num_per_batch, engine,
+                                   tmp_lbl_data + i * anchor_num);
+
+      int cur_fg_num = fg_bg_gt[0].size();
+      int cur_bg_num = fg_bg_gt[1].size();
+      std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
+                     [i, anchor_num](int d) { return d + i * anchor_num; });
+      memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
+      std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
+                     score_index + cur_fg_num,
+                     [i, anchor_num](int d) { return d + i * anchor_num; });
+
+      // get target bbox deltas
+      if (cur_fg_num) {
+        Tensor fg_gt;
+        T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
+        Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
+        T* tgt_data = tgt_bbox.data<T>();
+        Gather<T>(anchor_t->data<T>(), 4,
+                  reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
+                  tgt_data);
+        Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
+                  cur_fg_num, gt_data);
+        BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
+      }
+
+      loc_index += cur_fg_num;
+      score_index += cur_fg_num + cur_bg_num;
+      fg_num += cur_fg_num;
+      bg_num += cur_bg_num;
+    }
+
+    int lbl_num = fg_num + bg_num;
+    PADDLE_ENFORCE_LE(fg_num, max_num);
+    PADDLE_ENFORCE_LE(lbl_num, max_num);
+
+    tgt_bbox_t->Resize({fg_num, 4});
+    loc_index_t->Resize({fg_num});
+    score_index_t->Resize({lbl_num});
+    auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
+    Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
+                    lbl_data);
+  }
+
+ private:
  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
                   const int row, const int col, const float pos_threshold,
-                   const float neg_threshold, int64_t* target_label_data,
+                   const float neg_threshold, int64_t* target_label,
                   std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
-    int fg_offset = fg_inds->size();
-    int bg_offset = bg_inds->size();
+    float epsilon = 0.0001;
    for (int64_t i = 0; i < row; ++i) {
      const T* v = dist_data + i * col;
-      T max_dist = *std::max_element(v, v + col);
+      T max = *std::max_element(v, v + col);
      for (int64_t j = 0; j < col; ++j) {
-        T val = dist_data[i * col + j];
-        if (val == max_dist) target_label_data[j] = 1;
+        if (std::abs(max - v[j]) < epsilon) {
+          target_label[j] = 1;
+        }
      }
    }

-    // Pick the fg/bg and count the number
+    // Pick the fg/bg
+    const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
    for (int64_t j = 0; j < col; ++j) {
-      if (anchor_to_gt_max.data<T>()[j] > pos_threshold) {
-        target_label_data[j] = 1;
-      } else if (anchor_to_gt_max.data<T>()[j] < neg_threshold) {
-        target_label_data[j] = 0;
+      if (anchor_to_gt_max_data[j] >= pos_threshold) {
+        target_label[j] = 1;
+      } else if (anchor_to_gt_max_data[j] < neg_threshold) {
+        target_label[j] = 0;
      }
-      if (target_label_data[j] == 1) {
-        fg_inds->push_back(fg_offset + j);
-      } else if (target_label_data[j] == 0) {
-        bg_inds->push_back(bg_offset + j);
+      if (target_label[j] == 1) {
+        fg_inds->push_back(j);
+      } else if (target_label[j] == 0) {
+        bg_inds->push_back(j);
      }
    }
  }

-  void ReservoirSampling(const int num, const int offset,
-                         std::minstd_rand engine,
+  void ReservoirSampling(const int num, std::minstd_rand engine,
                         std::vector<int>* inds) const {
    std::uniform_real_distribution<float> uniform(0, 1);
-    const int64_t size = static_cast<int64_t>(inds->size() - offset);
-    if (size > num) {
-      for (int64_t i = num; i < size; ++i) {
+    size_t len = inds->size();
+    if (len > static_cast<size_t>(num)) {
+      for (size_t i = num; i < len; ++i) {
        int rng_ind = std::floor(uniform(engine) * i);
        if (rng_ind < num)
-          std::iter_swap(inds->begin() + rng_ind + offset,
-                         inds->begin() + i + offset);
+          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
      }
+      inds->resize(num);
    }
  }

-  void RpnTargetAssign(const framework::ExecutionContext& ctx,
-                       const Tensor& dist, const float pos_threshold,
-                       const float neg_threshold, const int rpn_batch_size,
-                       const int fg_num, std::minstd_rand engine,
-                       std::vector<int>* fg_inds, std::vector<int>* bg_inds,
-                       int64_t* target_label_data) const {
+  // Returns {fg anchor indices, bg anchor indices, matched gt index per fg}.
+  std::vector<std::vector<int>> SampleFgBgGt(
+      const platform::CPUDeviceContext& ctx, const Tensor& dist,
+      const float pos_threshold, const float neg_threshold,
+      const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
+      int64_t* target_label) const {
    auto* dist_data = dist.data<T>();
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    int fg_offset = fg_inds->size();
-    int bg_offset = bg_inds->size();
+    int row = dist.dims()[0];
+    int col = dist.dims()[1];
+
+    std::vector<int> fg_inds;
+    std::vector<int> bg_inds;
+    std::vector<int> gt_inds;

    // Calculate the max IoU between anchors and gt boxes
-    Tensor anchor_to_gt_max;
-    anchor_to_gt_max.mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(col), 1}),
-        platform::CPUPlace());
-    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    auto x = EigenMatrix<T>::From(dist);
-    auto x_col_max = EigenMatrix<T>::From(anchor_to_gt_max);
-    x_col_max.device(place) =
-        x.maximum(Eigen::DSizes<int, 1>(0))
-            .reshape(Eigen::DSizes<int, 2>(static_cast<int64_t>(col), 1));
+    // Map from anchor to gt box that has highest overlap
+    auto place = ctx.GetPlace();
+    Tensor anchor_to_gt_max, anchor_to_gt_argmax;
+    anchor_to_gt_max.mutable_data<T>({col}, place);
+    int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
+
+    auto x = framework::EigenMatrix<T>::From(dist);
+    auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
+    auto x_col_argmax =
+        framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
+    x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
+    x_col_argmax = x.argmax(0).template cast<int>();
+
    // Follow the Faster RCNN's implementation
    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
-                neg_threshold, target_label_data, fg_inds, bg_inds);
+                neg_threshold, target_label, &fg_inds, &bg_inds);
    // Reservoir Sampling
-    ReservoirSampling(fg_num, fg_offset, engine, fg_inds);
-    int bg_num = rpn_batch_size - (fg_inds->size() - fg_offset);
-    ReservoirSampling(bg_num, bg_offset, engine, bg_inds);
-  }
+    ReservoirSampling(fg_num, engine, &fg_inds);
+    int fg_num2 = static_cast<int>(fg_inds.size());
+    int bg_num = rpn_batch_size - fg_num2;
+    ReservoirSampling(bg_num, engine, &bg_inds);

-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dist = context.Input<LoDTensor>("DistMat");
-    auto* loc_index = context.Output<Tensor>("LocationIndex");
-    auto* score_index = context.Output<Tensor>("ScoreIndex");
-    auto* tgt_lbl = context.Output<Tensor>("TargetLabel");
-
-    auto col = dist->dims()[1];
-    int64_t n = dist->lod().size() == 0UL
-                    ? 1
-                    : static_cast<int64_t>(dist->lod().back().size() - 1);
-    if (dist->lod().size()) {
-      PADDLE_ENFORCE_EQ(dist->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
+    gt_inds.reserve(fg_num2);
+    for (int i = 0; i < fg_num2; ++i) {
+      gt_inds.emplace_back(argmax[fg_inds[i]]);
    }
-    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
-    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
-    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
-    float fg_fraction = context.Attr<float>("fg_fraction");
-
-    int fg_num = static_cast<int>(rpn_batch_size * fg_fraction);
+    std::vector<std::vector<int>> fg_bg_gt;
+    fg_bg_gt.emplace_back(fg_inds);
+    fg_bg_gt.emplace_back(bg_inds);
+    fg_bg_gt.emplace_back(gt_inds);

-    int64_t* target_label_data =
-        tgt_lbl->mutable_data<int64_t>({n * col, 1}, context.GetPlace());
-
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
-    iset(dev_ctx, tgt_lbl, static_cast<int>(-1));
-
-    std::vector<int> fg_inds;
-    std::vector<int> bg_inds;
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
-    engine.seed(seed);
-
-    if (n == 1) {
-      RpnTargetAssign(context, *dist, pos_threshold, neg_threshold,
-                      rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
-                      target_label_data);
-    } else {
-      auto lod = dist->lod().back();
-      for (size_t i = 0; i < lod.size() - 1; ++i) {
-        Tensor one_ins = dist->Slice(lod[i], lod[i + 1]);
-        RpnTargetAssign(context, one_ins, pos_threshold, neg_threshold,
-                        rpn_batch_size, fg_num, engine, &fg_inds, &bg_inds,
-                        target_label_data + i * col);
-      }
-    }
-    int* loc_index_data = loc_index->mutable_data<int>(
-        {static_cast<int>(fg_inds.size())}, context.GetPlace());
-    int* score_index_data = score_index->mutable_data<int>(
-        {static_cast<int>(fg_inds.size() + bg_inds.size())},
-        context.GetPlace());
-    memcpy(loc_index_data, reinterpret_cast<int*>(&fg_inds[0]),
-           fg_inds.size() * sizeof(int));
-    memcpy(score_index_data, reinterpret_cast<int*>(&fg_inds[0]),
-           fg_inds.size() * sizeof(int));
-    memcpy(score_index_data + fg_inds.size(),
-           reinterpret_cast<int*>(&bg_inds[0]), bg_inds.size() * sizeof(int));
+    return fg_bg_gt;
  }
 };

 class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
+    AddInput("Anchor",
+             "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
+    AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
    AddInput(
        "DistMat",
        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
@@ -241,12 +305,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
        "ScoreIndex",
        "(Tensor), The indexes of foreground and background anchors in all "
        "RPN anchors(The rest anchors are ignored). The shape of the "
-        "ScoreIndex is [F + B], F and B depend on the value of input "
-        "tensor and attributes.");
-    AddOutput("TargetLabel",
+        "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
+        " number.");
+    AddOutput("TargetBBox",
+              "(Tensor<int64_t>), The target bbox deltas with shape "
+              "[F, 4], F is the sampled foreground number.");
+    AddOutput(
+        "TargetLabel",
        "(Tensor<int64_t>), The target labels of each anchor with shape "
-              "[K * M, 1], "
-              "K and M is the same as they are in DistMat.");
+        "[F + B, 1], F and B are sampled foreground and backgroud number.");
    AddComment(R"DOC(
 This operator can be, for given the IoU between the ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.

--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -39,8 +39,17 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                const std::string& out_var_name) {
  VLOG(4) << "RequestSendHandler:" << varname;

+  // Sync
+  if (varname == BATCH_BARRIER_MESSAGE) {
+    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
+    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
+    rpc_server_->Complete();
+  } else {
    // Async
    if (!sync_mode_) {
+      VLOG(3) << "async process var: " << varname;
      rpc_server_->Profiler().OneStep();
      try {
        executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
@@ -50,17 +59,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
        return false;
      }
      return true;
-  }
-
-  // Sync
-  if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
-    rpc_server_->IncreaseBatchBarrier(kRequestSend);
-  } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
-    rpc_server_->Complete();
-  } else {
-    VLOG(3) << "sync: received var_name: " << varname;
+    } else {  // sync
      rpc_server_->WaitCond(kRequestSend);
      VLOG(3) << "sync: processing received var: " << varname;

@@ -68,11 +67,13 @@ bool RequestSendHandler::Handle(const std::string& varname,
        LOG(FATAL) << "sync: Can not find server side var: " << varname;
        return false;
      }
+
      if (invar->IsType<framework::SelectedRows>()) {
        std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
        sparse_vars_.push_back(invar);
      }
    }
+  }
  return true;
 }


--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <glog/logging.h>
 #include <algorithm>
+#include <iterator>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -94,8 +95,11 @@ class RowwiseTransformIterator;
 template <typename T, typename DeviceContext>
 class MidWiseTransformIterator;

+// NOTE(dzhwinter): std::iterator (and its ptrdiff_t default) is deprecated in C++17
 template <typename T>
-class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
+class RowwiseTransformIterator<T, platform::CPUDeviceContext>
+    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
+                           T *, T &> {
 public:
  RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}

@@ -126,7 +130,9 @@ class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
 };

 template <typename T>
-class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
+class MidWiseTransformIterator<T, platform::CPUDeviceContext>
+    : public std::iterator<std::random_access_iterator_tag, T, std::ptrdiff_t,
+                           T *, T &> {
 public:
  MidWiseTransformIterator(const T *ptr, int n, int post)
      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
@@ -479,8 +485,13 @@ void ElemwiseGradComputeNoBroadcast(
    const framework::Tensor &dout, int axis, framework::Tensor *dx,
    framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) {
  size_t N = static_cast<size_t>(framework::product(x_dim));
+#if !defined(_WIN32)
  platform::ForRange<DeviceContext> for_range(
      ctx.template device_context<DeviceContext>(), N);
+#else
+  platform::ForRange<DeviceContext> for_range(
+      ctx.device_context<DeviceContext>(), N);
+#endif  // !_WIN32
  for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP>{
      x.data<T>(), y.data<T>(), out.data<T>(), dout.data<T>(), dx_op, dy_op,
      dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
@@ -633,13 +644,13 @@ void ElementwiseGradCompute(const framework::ExecutionContext &ctx,

 template <typename Functor, typename DeviceContext, typename T,
          typename OutType = T>
+
 void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
                          const framework::Tensor *x,
                          const framework::Tensor *y, int axis, Functor func,
                          framework::Tensor *z) {
  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
      x, y, z, ctx.template device_context<DeviceContext>(), func);
-
  auto x_dims = x->dims();
  auto y_dims_untrimed = y->dims();
  PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(),

--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -14,86 +14,198 @@ limitations under the License. */

 #include "paddle/fluid/operators/fake_quantize_op.h"
 #include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/clip_op.h"
+#include "paddle/fluid/platform/transform.h"

 namespace paddle {
 namespace operators {

-class FakeQuantizeOp : public framework::OperatorWithKernel {
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVectorArrayMap =
+    Eigen::TensorMap<Eigen::Tensor<T, 1, MajorType, IndexType>>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using ConstEigenVectorArrayMap =
+    Eigen::TensorMap<const Eigen::Tensor<T, 1, MajorType, IndexType>>;
+
+template <typename T>
+struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, T* out) {
+    Eigen::DSizes<Eigen::DenseIndex, 1> idim(num);
+    Eigen::DSizes<Eigen::DenseIndex, 1> odim(1);
+    Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor>> in_e(in, idim);
+    Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>> out_e(out, odim);
+
+    out_e = in_e.abs().maximum();
+  }
+};
+
+template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
+// Clips `in` to [-scale, scale] and stores round(bin_cnt / scale * clipped).
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    T s = scale.data<T>()[0];
+    platform::Transform<platform::CPUDeviceContext> trans;
+    trans(ctx, in.data<T>(), in.data<T>() + in.numel(),
+          out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    // Quantize the clipped values in `out`; reading `in` would drop the clip.
+    out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
+  }
+};
+
+template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
+
+template <typename T>
+struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
+    T* scale_arr = scales_arr->mutable_data<T>(ctx.GetPlace());
+    int64_t it = iter.data<int64_t>()[0];
+    int idx = it % window_size;
+    T removed = scale_arr[idx];
+    T cur = cur_scale.data<T>()[0];
+    scale_arr[idx] = cur;
+
+    T max = last_scale.data<T>()[0];
+    if (max < cur) {
+      max = cur;
+    } else if (fabs(removed - max) < 1e-6) {
+      int size = (it > window_size) ? window_size : it;
+      FindAbsMaxFunctor<platform::CPUDeviceContext, T>()(ctx, scale_arr, size,
+                                                         &max);
+    }
+    out_scale->mutable_data<T>(ctx.GetPlace())[0] = max;
+  }
+};
+
+template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
+class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
 public:
-  FakeQuantizeOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
+  FakeQuantizeAbsMaxOp(const std::string& type,
+                       const framework::VariableNameMap& inputs,
+                       const framework::VariableNameMap& outputs,
+                       const framework::AttributeMap& attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FakeQuantizeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of FakeQuantizeOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutMovingScale"),
-                   "OutMovingScale(Out) of FakeQuantizeOp should not be null");
-    // if (ctx->HasInput("InMovingScale")) {
-    ctx->SetOutputDim("OutMovingScale", ctx->GetInputDim("InMovingScale"));
-    //}
-    // if (ctx->HasInput("InScales")) {
-    PADDLE_ENFORCE(ctx->HasOutput("OutScales"),
-                   "OutScales(Out) of FakeQuantizeOp should not be null");
-    ctx->SetOutputDim("OutScales", ctx->GetInputDim("InScales"));
-    // PADDLE_ENFORCE_EQ(ctx->Inputs("InScales")[0],
-    // ctx->Outputs("OutScales")[0],
-    //                  "Mean and MeanOut should share the same memory");
-    //}
+    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
+                   "Output(Scale) of FakeQuantizeOp should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };

-class FakeQuantizeOpMaker : public framework::OpProtoAndCheckerMaker {
+class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of scale operator.");
-    AddInput("InScales", "(Tensor) scale buffer, used in static quantization.")
-        .AsDispensable();
-    AddInput("InMovingScale", "Last scale, used in static quantization.")
-        .AsDispensable();
-    AddInput("InCurrentIter",
-             "Last iteration number, used in static quantization.")
-        .AsDispensable();
-    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
-    AddOutput("OutScales",
-              "(Tensor) scale buffer, used in static quantization.")
-        .AsDispensable();
-    AddOutput("OutMovingScale", " Current scale");
-    AddOutput("OutCurrentIter", "Current iteration number.").AsDispensable();
-    AddAttr<std::string>("quantize_type",
-                         "(string, default abs_max)"
-                         "The scaling tpe of the quantize operator.")
-        .SetDefault("abs_max");
-    AddAttr<int>("window_size", "(int, default 10000)").SetDefault(10000);
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddOutput("Out",
+              "(Tensor) Output of quantized low level tensor, "
+              "but also saved as float data type.");
+    AddOutput("OutScale", "(Tensor) Current scale");
    AddAttr<int>("bit_length", "(int, default 8)")
        .SetDefault(8)
-        .AddCustomChecker([](const int &bit_length) {
+        .AddCustomChecker([](const int& bit_length) {
          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
                         "'bit_length' should be between 1 and 16.");
        });
-    AddAttr<bool>("is_test", "").SetDefault(false);
    AddComment(R"DOC(
 FakeQuantize operator

-quantize_type = abs_max:
+$$scale = max(abs(X))$$ 
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$

-    $$scale = max(abs(x))$$ 
+)DOC");
+  }
+};

-quantize_type = range_abs_max:
+class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeRangeAbsMaxOp(const std::string& type,
+                            const framework::VariableNameMap& inputs,
+                            const framework::VariableNameMap& outputs,
+                            const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}

-    $$scale = max(max(abs(x)), history_abs_max)$$ 
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeQuantizeRangeAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeQuantizeRangeAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("OutScale"),
+        "Output(OutScale) of FakeQuantizeRangeAbsMaxOp should not be null");
+    if (ctx->HasOutput("OutScales")) {
+      int window_size = ctx->Attrs().Get<int>("window_size");
+      ctx->SetOutputDim("OutScales", {window_size});
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }

-quantize_type = moving_average_abs_max:
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};

-    $$scale = 0.1*scale+0.9*new_abs_max)$$ 
+class FakeQuantizeRangeAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddInput("InScale", "Last scale.");
+    AddInput("Iter", "Global step iteration.").AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScale", " Current scale");
+    AddOutput("OutScales", "(Tensor) scale buffer.").AsDispensable();
+    AddAttr<int>("window_size", "(int, default 10000) window range size.")
+        .SetDefault(10000);
+    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator is used in static quantization.

-$$Out = scale*X$$
+$$scale = max(max(abs(x)), history_abs_max)$$ 
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$

 )DOC");
  }
@@ -103,10 +215,16 @@ $$Out = scale*X$$
 }  // namespace paddle

 namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp,
+                  ops::FakeQuantizeAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
+                       ops::FakeQuantizeAbsMaxKernel<CPU, float>);

-REGISTER_OPERATOR(fake_quantize, ops::FakeQuantizeOp, ops::FakeQuantizeOpMaker,
+REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
+                  ops::FakeQuantizeRangeAbsMaxOpMaker,
                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    fake_quantize,
-    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FakeQuantizeKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
+                       ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <string>
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/fake_quantize_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -20,7 +21,7 @@ namespace paddle {
 namespace operators {

 template <typename T>
-__global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
+__global__ void FindAbsMaxKernel(const T* in, const int n, T* out) {
  int bid = threadIdx.x + blockIdx.x * blockDim.x;
  int tid = threadIdx.x;

@@ -43,7 +44,7 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
  __syncthreads();

  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i && shared_max_data[tid] < shared_max_data[tid + i]) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
      shared_max_data[tid] = shared_max_data[tid + i];
    }
    __syncthreads();
@@ -53,220 +54,125 @@ __global__ void FindAbsMaxKernel(const int n, const T* in, T* out) {
  }
 }

-float FindAbsMaxGpu(const platform::CUDADeviceContext& ctx, const float* array,
-                    int length) {
-  float host_max;
-  int kNumTheads = 1024;
-  int gridDimx = (kNumTheads - 1 + length) / kNumTheads;
-  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
-  framework::Tensor t;
-  float* device_max = t.mutable_data<float>(framework::make_ddim({gridDimx}),
-                                            platform::CUDAPlace());
-  FindAbsMaxKernel<float><<<gridDimx, kNumTheads, kNumTheads * sizeof(float),
-                            ctx.stream()>>>(length, array, device_max);
-  FindAbsMaxKernel<
-      float><<<1, kNumTheads, kNumTheads * sizeof(float), ctx.stream()>>>(
-      gridDimx, device_max, device_max);
-  PADDLE_ENFORCE_EQ(
-      cudaMemcpy(&host_max, device_max, sizeof(float), cudaMemcpyDeviceToHost),
-      cudaSuccess, "cudaMemcpy failed");
-  return host_max;
-}
+// CUDA specialization of FindAbsMaxFunctor. Launches FindAbsMaxKernel twice on
+// ctx.stream(): first over `in` with at most 1024 blocks, writing into a
+// scratch tensor that has one slot per block, then with a single block over
+// that scratch tensor, writing to `out`. Nothing is copied back to the host.
+template <typename T>
+struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, T* out) {
+    const int threads = 1024;
+    // ceil(num / threads) blocks, capped at `threads` blocks -- presumably so
+    // the single-block second pass can cover every first-pass slot.
+    int blocks = (threads - 1 + num) / threads;
+    if (blocks > threads) {
+      blocks = threads;
+    }
+
+    framework::Tensor block_max;
+    T* block_max_data = block_max.mutable_data<T>(
+        framework::make_ddim({blocks}), ctx.GetPlace());
+    // Both launches request threads * sizeof(T) bytes of dynamic shared memory.
+    FindAbsMaxKernel<T><<<blocks, threads, threads * sizeof(T), ctx.stream()>>>(
+        in, num, block_max_data);
+    FindAbsMaxKernel<T><<<1, threads, threads * sizeof(T), ctx.stream()>>>(
+        block_max_data, blocks, out);
+  }
+};
+
+template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;

 template <typename T>
-__global__ void ApplySaturateKernel(const int n, const T* in, T* out,
-                                    int* num_saturate, const T min,
-                                    const T max) {
+// For every i in [0, n): clamps in[i] to [-s, s] with s = scale[0], multiplies
+// by bin_cnt / s and stores the rounded result in out[i].
+// `scale` is read once per thread; `in` and `out` are only accessed through
+// the loop index, so the kernel is correct for any launch size.
+__global__ void ClipAndQuantKernel(const T* in, const T* scale,
+                                   const int bin_cnt, const int n, T* out) {
  int bid = threadIdx.x + blockIdx.x * blockDim.x;
  int tid = threadIdx.x;

-  extern __shared__ int shared_count[];
-  shared_count[tid] = 0;
+  T s = scale[0];
  for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    if (in[i] > max) {
-      out[i] = max;
-      shared_count[tid] += 1;
-    } else if (in[i] < min) {
-      out[i] = min;
-      shared_count[tid] += 1;
-    } else {
-      out[i] = in[i];
-    }
-  }
-  __syncthreads();
-
-  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
-    if (tid < i) {
-      shared_count[tid] += shared_count[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    num_saturate[blockIdx.x] = shared_count[0];
+    // Index with the loop variable: the loop advances by the total number of
+    // launched threads, so indexing with bid would redo element bid on every
+    // pass and leave the later elements unwritten.
+    T x = in[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt / s * v;
+    out[i] = round(v);
  }
 }

 template <typename T>
-__global__ void ReduceKernel(const int n, const T* in, T* out) {
-  int tid = threadIdx.x;
-  extern __shared__ T shared_sum[];
-  if (tid < n) {
-    shared_sum[tid] = in[tid];
+// Stores cur_scale[0] into slot (iter[0] % window_size) of scale_arr and
+// writes max(last_scale[0], cur_scale[0]) to out_scale[0].
+// If the value that was overwritten is within 1e-6 of last_scale[0], sets
+// need_find_max[0] = 1 and out_size[0] to the number of leading slots of
+// scale_arr the caller should rescan; otherwise sets need_find_max[0] = 0 and
+// leaves out_size untouched.
+// No thread index is used, so every launched thread performs the same work.
+__global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
+                                            const T* last_scale,
+                                            const int64_t* iter,
+                                            const int window_size, T* scale_arr,
+                                            T* out_scale, int* need_find_max,
+                                            int* out_size) {
+  int it = iter[0];
+  int idx = it % window_size;
+  T removed = scale_arr[idx];
+  T cur = cur_scale[0];
+  scale_arr[idx] = cur;
+  T max = last_scale[0];
+  out_scale[0] = max < cur ? cur : max;
+  if (fabs(removed - max) < 1e-6) {
+    need_find_max[0] = 1;
+    // Slots [0, it] must be rescanned so that the slot written above is
+    // included; once the window has wrapped, the whole array is rescanned.
+    // (Using `it` here dropped the newest value and gave 0 when it == 0.)
+    out_size[0] = it + 1 > window_size ? window_size : it + 1;
  } else {
-    shared_sum[tid] = T(0);
-  }
-  __syncthreads();
-  // blockDim.x must >= n
-  for (int i = (n + 1) / 2; i > 0; i >>= 1) {
-    if (tid < i) {
-      shared_sum[tid] += shared_sum[tid + i];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    out[0] = shared_sum[0];
+    need_find_max[0] = 0;
  }
 }

 template <typename T>
-int ApplySaturateGpu(const platform::CUDADeviceContext& ctx, const int n,
-                     const T* in, T* out, const T min, const T max) {
-  int host_num_saturate;
-  int kNumTheads = 1024;
-  int gridDimx = (n + kNumTheads - 1) / kNumTheads;
-  gridDimx = (gridDimx > kNumTheads) ? kNumTheads : gridDimx;
-  framework::Tensor t;
-  int* device_num_saturate = t.mutable_data<int>(
-      framework::make_ddim({gridDimx}), platform::CUDAPlace());
-  ApplySaturateKernel<
-      T><<<gridDimx, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
-      n, in, out, device_num_saturate, min, max);
-  ReduceKernel<int><<<1, kNumTheads, kNumTheads * sizeof(T), ctx.stream()>>>(
-      gridDimx, device_num_saturate, device_num_saturate);
-  PADDLE_ENFORCE_EQ(cudaSuccess,
-                    cudaMemcpy(&host_num_saturate, device_num_saturate,
-                               sizeof(int), cudaMemcpyDeviceToHost),
-                    "cudaMemcpy failed");
-  return host_num_saturate;
-}
-
-template <typename DeviceContext, typename T>
-class FakeQuantizeCUDAKernel : public framework::OpKernel<T> {
- public:
-  T FindRangeAbsMax(const platform::CUDADeviceContext& ctx,
-                    framework::Tensor* scale_list, framework::Tensor* out_scale,
-                    const T& cur_scale, int window_size,
-                    int current_iter) const {
-    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
-    T remove_tmp = sl[current_iter];
-    sl[current_iter] = cur_scale;
-    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
-    if (max_scale < cur_scale) {
-      max_scale = cur_scale;
-    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
-      int size = (current_iter > window_size) ? window_size : current_iter;
-      max_scale = T(FindAbsMaxGpu(ctx, scale_list->data<float>(), size));
-    }
-    return max_scale;
-  }
-
-  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
-                             framework::Tensor* out_scale,
-                             const T& cur_scale) const {
-    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
-    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
-    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
-    return T(outs[0]);
+// CUDA specialization of FindRangeAbsMaxFunctor.
+// 1. Launches FindRangeAbsMaxAndFillArray (one thread) to store the current
+//    scale into the window array `scales_arr` and to write
+//    max(last_scale, cur_scale) into `out_scale`.
+// 2. Copies the kernel's "need rescan" flag to the host; when it is set, copies
+//    the rescan length as well and recomputes `out_scale` from the window
+//    array with FindAbsMaxFunctor.
+// `cur_scale`, `last_scale` and `iter` are handed to the kernel as raw device
+// pointers, so they must already live on the GPU of `ctx`.
+struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale) {
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+
+    T* scale_arr = scales_arr->mutable_data<T>(gpu_place);
+    T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
+
+    // Scratch scalars written by the kernel. They are freshly constructed
+    // tensors with no shape yet, so they are given an explicit one-element
+    // shape before the kernel writes element 0.
+    framework::Tensor need_find_max, out_size;
+    int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
+    int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
+
+    FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
+        cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
+        window_size, scale_arr, out_scale_data, find_max, out_size_data);
+
+    // NOTE(review): these copies pass stream 0 while the kernel above ran on
+    // ctx.stream(); this presumably relies on the copy synchronizing with that
+    // stream -- confirm, or copy on ctx.stream() and wait for it.
+    int g_find_max;
+    memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
+                 sizeof(int), 0);
+    if (g_find_max) {
+      int len;
+      memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
+                   sizeof(int), 0);
+      // A non-positive length would make FindAbsMaxFunctor launch with zero
+      // blocks; keep the value already written by the kernel in that case.
+      if (len > 0) {
+        FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
+                                                            out_scale_data);
+      }
    }
-
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto& device_ctx = context.cuda_device_context();
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    const bool is_test = context.Attr<bool>("is_test");
-    tensor->mutable_data<T>(in->place());
-    context.Output<framework::Tensor>("OutMovingScale")
-        ->mutable_data<T>(
-            context.Input<framework::Tensor>("InMovingScale")->place());
-    auto quantize_type =
-        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
-    if (quantize_type == std::string("range_abs_max")) {
-      context.Output<framework::Tensor>("OutScales")
-          ->mutable_data<T>(
-              context.Input<framework::Tensor>("InScales")->place());
-      context.Output<framework::Tensor>("OutCurrentIter")
-          ->mutable_data<T>(
-              context.Input<framework::Tensor>("InCurrentIter")->place());
  }
+};

-    T scale = T(1);
-    int window_size = context.Attr<int>("window_size");
-    T bin_cnt = (T)((1 << (context.Attr<int>("bit_length") - 1)) - 1);
-    if (quantize_type == std::string("abs_max")) {
-      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
-      scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-      saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
+template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;

-      auto& device_ctx = context.template device_context<DeviceContext>();
-      auto* scale_list = context.Output<framework::Tensor>("OutScales");
-      math::SetConstant<DeviceContext, T> scalar;
-      scale_list->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, scale_list, static_cast<T>(0));
-      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-      iter->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, iter, static_cast<T>(0));
-    } else if (quantize_type == std::string("range_abs_max")) {
-      auto* moving_scale = const_cast<framework::Tensor*>(
-          context.Input<framework::Tensor>("InMovingScale"));
-      if (is_test) {
-        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
-      } else {
-        auto* it = const_cast<framework::Tensor*>(
-            context.Input<framework::Tensor>("InCurrentIter"));
-        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-        int* last_iter = it->mutable_data<int>(platform::CPUPlace());
-        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
-        auto* scale_list = context.Output<framework::Tensor>("OutScales");
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-        scale = FindRangeAbsMax(device_ctx, scale_list, saving_scale, scale,
-                                window_size, current_iter[0]);
-        (*current_iter) = (*last_iter) + 1;
-      }
-    } else if (quantize_type == std::string("moving_average_abs_max")) {
-      auto* moving_scale = const_cast<framework::Tensor*>(
-          context.Input<framework::Tensor>("InMovingScale"));
-      if (is_test) {
-        scale = moving_scale->mutable_data<T>(platform::CPUPlace())[0];
-      } else {
-        scale = (T)FindAbsMaxGpu(device_ctx, in->data<float>(), in->numel());
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        scale = FindMovingAverageAbsMmax(
-            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
-      }
-    }
+// CUDA specialization of ClipAndFakeQuantFunctor: allocates `out` on the
+// place of `ctx` and launches ClipAndQuantKernel on ctx.stream() over all
+// in.numel() elements, passing `scale`'s data pointer and `bin_cnt` through.
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    // ceil(num / block) blocks: at least one thread per element, no cap.
+    int grid = (block - 1 + num) / block;

-    ApplySaturateGpu<T>(device_ctx, in->numel(), in->data<T>(),
-                        tensor->mutable_data<T>(in->place()), -scale, scale);
-    scale = bin_cnt / scale;
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());

-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
-    eigen_out.device(dev) = (scale * eigen_in).round();
+    // No dynamic shared memory is requested (third launch argument is 0).
+    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, out_data);
  }
 };

+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP_CUDA_KERNEL(fake_quantize,
-                        paddle::operators::FakeQuantizeCUDAKernel<
-                            paddle::platform::CUDADeviceContext, float>,
-                        paddle::operators::FakeQuantizeCUDAKernel<
-                            paddle::platform::CUDADeviceContext, double>);
+// CUDA kernel registration for the two fake-quantize ops. Only the float
+// instantiation of each kernel is registered.
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
+                        ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
+                        ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -17,137 +17,91 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/clip_op.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/transform.h"

 namespace paddle {
 namespace operators {

-using platform::Transform;
+// Device-dispatched functor, declared here and specialized per device context
+// (the CUDA specialization is in fake_quantize_op.cu).
+// Going by its name and the kernel the CUDA specialization launches, it
+// reduces the `num` values at `in` to their maximum absolute value and writes
+// that single value to `out` -- NOTE(review): confirm against the
+// specializations. In the CUDA specialization both pointers are passed
+// straight to a kernel, so they must be device pointers there.
+template <typename DeviceContext, typename T>
+struct FindAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const T* in, const int num, T* out);
+};
+
+// Device-dispatched functor, declared here and specialized per device context.
+// The CUDA specialization writes, for each element x of `in`,
+//   round(bin_cnt / s * clamp(x, -s, s))   with s = scale[0]
+// into `out`, allocating `out` on the place of `ctx` itself.
+// NOTE(review): other specializations are not visible here; presumably they
+// follow the same contract -- confirm.
+template <typename DeviceContext, typename T>
+struct ClipAndFakeQuantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  framework::Tensor* out);
+};
+
+// Device-dispatched functor, declared here and specialized per device context.
+// In the CUDA specialization it stores cur_scale[0] into slot
+// (iter[0] % window_size) of `scales_arr`, writes
+// max(last_scale[0], cur_scale[0]) to `out_scale`, and, when the overwritten
+// slot held (approximately) the previous maximum, recomputes `out_scale` from
+// the window array. `iter` is read as int64_t.
+template <typename DeviceContext, typename T>
+struct FindRangeAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale,
+                  const framework::Tensor& last_scale,
+                  const framework::Tensor& iter, const int window_size,
+                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
+};

 template <typename DeviceContext, typename T>
-class FakeQuantizeKernel : public framework::OpKernel<T> {
+// Op kernel that quantizes input "X" using a scale computed from X itself:
+// FindAbsMaxFunctor writes the scale into output "OutScale", then
+// ClipAndFakeQuantFunctor fills output "Out" using that scale and
+// bin_cnt = 2^(bit_length - 1) - 1.
+class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
 public:
-  T FindAbsMax(framework::Tensor* in, int n) const {
-    T* p = in->mutable_data<T>(platform::CPUPlace());
-    T abs_max = (T)0.00000001;
-    for (int i = 0; i < n; i++) {
-      T tmp = fabs(p[i]);
-      if (tmp > abs_max) abs_max = tmp;
-    }
-    return T(abs_max);
-  }
-  T FindRangeAbsMax(framework::Tensor* scale_list, framework::Tensor* out_scale,
-                    const T& cur_scale, int window_size,
-                    int current_iter) const {
-    T* sl = scale_list->mutable_data<T>(platform::CPUPlace());
-    T remove_tmp = sl[current_iter];
-    sl[current_iter] = cur_scale;
-    T& max_scale = out_scale->mutable_data<T>(platform::CPUPlace())[0];
-    if (max_scale < cur_scale) {
-      max_scale = cur_scale;
-    } else if (fabs(remove_tmp - max_scale) < 1e-6) {
-      int size = (current_iter > window_size) ? window_size : current_iter;
-      max_scale = T(FindAbsMax(scale_list, size));
-    }
-    return max_scale;
-  }
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+
+    // NOTE(review): "Out" is not allocated here; the CUDA
+    // ClipAndFakeQuantFunctor calls out->mutable_data itself -- confirm the
+    // other specializations do as well.
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_s = out_scale->mutable_data<T>(context.GetPlace());

-  T FindMovingAverageAbsMmax(framework::Tensor* in_scale,
-                             framework::Tensor* out_scale,
-                             const T& cur_scale) const {
-    T* ins = in_scale->mutable_data<T>(platform::CPUPlace());
-    T* outs = out_scale->mutable_data<T>(platform::CPUPlace());
-    outs[0] = 0.9 * cur_scale + 0.1 * ins[0];
-    return T(outs[0]);
+    int bit_length = context.Attr<int>("bit_length");
+    // Largest magnitude of a signed bit_length-bit integer, e.g. 127 for 8.
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    const T* in_data = in->data<T>();
+    // The scale is written through out_s, i.e. into "OutScale", which is then
+    // passed on as the scale tensor for quantization.
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
  }
+};

-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* tensor = context.Output<framework::Tensor>("Out");
+// Op kernel that quantizes input "X" with a scale tracked over a window of
+// iterations.
+// - is_test == true: quantizes with input "InScale" as-is and returns.
+// - otherwise: computes the scale of the current X, feeds it together with
+//   "InScale" and "Iter" to FindRangeAbsMaxFunctor (which updates "OutScales"
+//   and "OutScale"), then quantizes with "OutScale".
+// bin_cnt = 2^(bit_length - 1) - 1 in both paths.
+template <typename DeviceContext, typename T>
+class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<framework::Tensor>("X");
-    const bool is_test = context.Attr<bool>("is_test");
-    tensor->mutable_data<T>(in->place());
-
-    auto* oms_tensor = context.Output<framework::Tensor>("OutMovingScale");
-    oms_tensor->mutable_data<T>(in->place());
-
-    auto quantize_type =
-        static_cast<std::string>(context.Attr<std::string>("quantize_type"));
-    if (quantize_type == std::string("range_abs_max")) {
-      auto* oss_tensor = context.Output<framework::Tensor>("OutScales");
-      oss_tensor->mutable_data<T>(
-          context.Input<framework::Tensor>("InScales")->place());
-      auto* oci_tensor = context.Output<framework::Tensor>("OutCurrentIter");
-      oci_tensor->mutable_data<T>(
-          context.Input<framework::Tensor>("InCurrentIter")->place());
-    }
+    auto* in_scale = context.Input<framework::Tensor>("InScale");

-    T scale = static_cast<T>(1);
-    int window_size = context.Attr<int>("window_size");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    bool is_test = context.Attr<bool>("is_test");
    int bit_length = context.Attr<int>("bit_length");
    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    auto& dev_ctx = context.template device_context<DeviceContext>();

-    auto& dev =
-        *context.template device_context<DeviceContext>().eigen_device();
-    auto raw_in = framework::EigenVector<T>::Flatten(*in);
-    if (quantize_type == std::string("abs_max")) {
-      auto* saving_scale = context.Output<framework::Tensor>("OutMovingScale");
-      auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-      scale_out.device(dev) = raw_in.abs().maximum();
-      scale = scale_out(0);
-
-      auto& device_ctx = context.template device_context<DeviceContext>();
-      auto* scale_list = context.Output<framework::Tensor>("OutScales");
-      math::SetConstant<DeviceContext, T> scalar;
-      scale_list->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, scale_list, static_cast<T>(0));
-      auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-      iter->mutable_data<T>(context.GetPlace());
-      scalar(device_ctx, iter, static_cast<T>(0));
-    } else if (quantize_type == std::string("range_abs_max")) {
-      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
-      if (is_test) {
-        scale = moving_scale->data<T>()[0];
-      } else {
-        auto* it = context.Input<framework::Tensor>("InCurrentIter");
-        auto* iter = context.Output<framework::Tensor>("OutCurrentIter");
-        const int* last_iter = it->data<int>();
-        int* current_iter = iter->mutable_data<int>(platform::CPUPlace());
-        auto* scale_list = context.Output<framework::Tensor>("OutScales");
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-        scale_out.device(dev) = raw_in.abs().maximum();
-        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
-        scale = FindRangeAbsMax(scale_list, saving_scale, scale, window_size,
-                                current_iter[0]);
-        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
-        (*current_iter) = (*last_iter) + 1;
-      }
-    } else if (quantize_type == std::string("moving_average_abs_max")) {
-      auto* moving_scale = context.Input<framework::Tensor>("InMovingScale");
+    // Test mode: quantize with the stored scale; no scale outputs are touched.
    if (is_test) {
-        scale = moving_scale->data<T>()[0];
-      } else {
-        auto* saving_scale =
-            context.Output<framework::Tensor>("OutMovingScale");
-        auto scale_out = framework::EigenVector<T>::Flatten(*saving_scale);
-        scale_out.device(dev) = raw_in.abs().maximum();
-        scale = saving_scale->mutable_data<T>(platform::CPUPlace())[0];
-        scale = FindMovingAverageAbsMmax(
-            const_cast<framework::Tensor*>(moving_scale), saving_scale, scale);
-        saving_scale->mutable_data<T>(platform::CPUPlace())[0] = scale;
-      }
+      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
+                                                  bin_cnt, out);
+      return;
    }

-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), in->data<T>(),
-          in->data<T>() + in->numel(), tensor->mutable_data<T>(in->place()),
-          ClipFunctor<T>(-scale, scale));
-    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*tensor);
-    eigen_out.device(dev) = (bin_cnt / scale * eigen_in).round();
+    // Training mode: update the scale window, then quantize with the result.
+    // NOTE(review): "OutScales" is not allocated here; the CUDA
+    // FindRangeAbsMaxFunctor calls mutable_data on it -- confirm the other
+    // specializations do as well.
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    auto* out_scales = context.Output<framework::Tensor>("OutScales");
+    auto* iter = context.Input<framework::Tensor>("Iter");
+
+    int window_size = context.Attr<int>("window_size");
+    out_scale->mutable_data<T>(context.GetPlace());
+
+    // One-element temporary that receives the scale of the current input.
+    framework::Tensor cur_scale;
+    T* cur_scale_data = cur_scale.mutable_data<T>({1}, context.GetPlace());
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
+                                          cur_scale_data);
+    FindRangeAbsMaxFunctor<DeviceContext, T>()(dev_ctx, cur_scale, *in_scale,
+                                               *iter, window_size, out_scales,
+                                               out_scale);
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
  }
 };


--- a/paddle/fluid/operators/flatten_op.cc
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -157,6 +157,116 @@ class FlattenGradOp : public framework::OperatorBase {
  }
 };

+// FIXME(zcd): flatten2 extends flatten with an intermediate output (XShape).
+// XShape carries the shape and LoD of X, which are needed by flatten2_grad;
+// this way the framework can reuse the memory of X as soon as flatten2_op has
+// finished.
+// Considering compatibility issues, the existing flatten op is kept unchanged.
+// Shape inference for flatten2: runs the flatten shape inference, then sets
+// output "XShape" to the dims of "X" prefixed with a leading 0 and shares the
+// LoD of "X" with it.
+class Flatten2OpInferShape : public FlattenOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    FlattenOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output (XShape) of Flatten op should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    const int rank = x_dims.size();
+    // Layout: [0, x_dims[0], ..., x_dims[rank - 1]].
+    std::vector<int64_t> padded_shape;
+    padded_shape.reserve(rank + 1);
+    padded_shape.push_back(0);
+    for (int d = 0; d < rank; ++d) {
+      padded_shape.push_back(x_dims[d]);
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(padded_shape));
+    ctx->ShareLoD("X", "XShape");
+  }
+};
+
+// flatten2 forward op. Computes the flattened shape of "X" for attribute
+// "axis" and delegates the actual work to a "reshape2" op created on the fly,
+// which produces both "Out" and "XShape".
+class Flatten2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    const auto &x_name = Input("X");
+    auto x_dims = scope.FindVar(x_name)->Get<framework::LoDTensor>().dims();
+    const auto &flat_shape =
+        FlattenOpInferShape::GetOutputShape(Attr<int>("axis"), x_dims);
+
+    // Attributes handed to reshape2: the target shape, not in place.
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["inplace"] = false;
+    reshape_attrs["shape"] = flat_shape;
+
+    // reshape2 is given an empty "Shape" input, so only the attribute above
+    // specifies the target shape.
+    auto reshape2 = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {x_name}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}},
+        reshape_attrs);
+    reshape2->Run(scope, place);
+  }
+};
+
+// Proto maker for flatten2: everything FlattenOpMaker declares, plus the
+// intermediate output "XShape".
+class Flatten2OpMaker : public FlattenOpMaker {
+ public:
+  void Make() override {
+    FlattenOpMaker::Make();
+    // Declared AsIntermediate, i.e. not a user-facing result of the op.
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in FlattenGradOp.")
+        .AsIntermediate();
+  }
+};
+
+// Builds the description of the gradient op for flatten2: a "flatten2_grad"
+// op that takes the forward "XShape" output and the gradient of "Out" as
+// inputs, produces the gradient of "X", and inherits the forward attributes.
+class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the moment it is created so that it is released if
+    // any of the setters below throws.
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("flatten2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+// Shape inference for flatten2_grad: the gradient of "X" gets the dims stored
+// in "XShape" with the leading entry dropped, and shares the LoD of "XShape".
+class Flatten2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    const auto x_grad_name = framework::GradVarName("X");
+    auto padded_dims = context->GetInputDim("XShape");
+    // Entries [1, size) of XShape are the original dims of X.
+    context->SetOutputDim(
+        x_grad_name, framework::slice_ddim(padded_dims, 1, padded_dims.size()));
+    context->ShareLoD("XShape", x_grad_name);
+  }
+};
+
+// flatten2 backward op. Reshapes the gradient of "Out" back to the shape of X
+// (recovered from "XShape" by dropping its leading entry) by running a
+// "reshape2" op created on the fly.
+class Flatten2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto x_grad_var = Output(framework::GradVarName("X"));
+    auto out_grad_var = Input(framework::GradVarName("Out"));
+    auto xshape_var = Input("XShape");
+
+    auto padded_dims =
+        scope.FindVar(xshape_var)->Get<framework::LoDTensor>().dims();
+    auto orig_dims = framework::slice_ddim(padded_dims, 1, padded_dims.size());
+
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["inplace"] = false;
+    reshape_attrs["shape"] = framework::vectorize2int(orig_dims);
+
+    // The XShape variable is also passed as reshape2's "XShape" output.
+    auto reshape2 = framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {out_grad_var}}, {"Shape", {}}},
+        {{"Out", {x_grad_var}}, {"XShape", {xshape_var}}}, reshape_attrs);
+    reshape2->Run(scope, place);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -167,3 +277,8 @@ REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
                  ops::FlattenOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
+
+// flatten2 and its gradient op are registered in addition to flatten and
+// flatten_grad above; flatten2 uses its own grad-op maker so that the grad op
+// receives XShape.
+REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
+                  ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker);
+REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
+                  ops::Flatten2GradInferShape);
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/fusion_gru_op.h"
+#include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Input(WeightH) of GRU should not be null.");

  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
-                 "Output(BatchedGate) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
-                 "Output(BatchResetHiddenPrev) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                 "Output(BatchedHidden) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                 "Output(ReorderedH0) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                 "Output(BatchedInput) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                 "Output(BatchedOut) of GRU should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                 "Output(Hidden) of GRU should not be null.");

@@ -83,12 +80,16 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
  }
  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedHidden", out_dims);
-  ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
+  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+  ctx->SetOutputDim("BatchedOut", out_dims);
  ctx->ShareLoD("X", "Hidden");

-  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+  int xx_width;
+  if (ctx->Attrs().Get<bool>("use_seq")) {
+    xx_width = wx_dims[1];
+  } else {
+    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+  }
  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
  ctx->ShareLoD("X", "XX");
 }
@@ -115,22 +116,29 @@ void FusionGRUOpMaker::Make() {
           "(Tensor) The FC weight with shape (M x 3D),"
           "where M is the dim size of x, D is the hidden size. ");
  AddInput("WeightH",
-           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
+           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
+           "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
+           "Acutally they are D x 2D and D x D two part weights."
+           "{W_update, W_reset; W_state}"
+           "{D x (D + D); D x D}");
  AddInput("Bias",
           "(Tensor, optional) (1 x 3D)."
           "Almost same as GRUOp."
           "Note: if have FC bias it should be added on this bias.")
      .AsDispensable();
+  AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
+      .AsIntermediate();
  AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            "(LoDTensor) the result after X * WeightX (size is T x 3D)"
            " or batched_X (size is T x M), this will be automatically chosen,"
            " where T is the total time steps in this mini-batch,"
            " D is the hidden size, M is the dim size of x input.")
      .AsIntermediate();
-  AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
-  AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.")
+  AddOutput("BatchedInput",
+            "(LoDTensor) This is the batched result of input X"
+            "or the batched result after fc, shape (T x 3D)")
      .AsIntermediate();
-  AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
+  AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
      .AsIntermediate();
  AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
  AddAttr<std::string>("activation",
@@ -146,6 +154,10 @@ void FusionGRUOpMaker::Make() {
                "(bool, defalut: False) "
                "whether to compute reversed GRU.")
      .SetDefault(false);
+  AddAttr<bool>("use_seq",
+                "(bool, default: True) "
+                "whether to use seq mode to compute GRU.")
+      .SetDefault(true);
  AddComment(R"DOC(
 The Fusion complete GRU Operator.
 This operator fuse the fully-connected operator into GRU, 
@@ -153,172 +165,261 @@ more details can refer to GRU op.
 )DOC");
 }

-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
+template <typename T>
 class FusionGRUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* h0 = ctx.Input<Tensor>("H0");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
-    auto* batch_reset_hidden_prev =
-        ctx.Output<LoDTensor>("BatchResetHiddenPrev");
-    auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
+    if (ctx.Attr<bool>("use_seq")) {
+      SeqCompute(ctx);
+    } else {
+      BatchCompute(ctx);
+    }
+  }
+
+#define INIT_VEC_FUNC                                                     \
+  std::function<void(const int, const T *, T *)> act_gate, act_state;     \
+  std::function<void(const int, const T*, const T*, const T*, T*)> cross; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");          \
+  auto& act_state_str = ctx.Attr<std::string>("activation");              \
+  if (platform::jit::MayIUse(platform::jit::avx)) {                       \
+    math::VecActivations<T, platform::jit::avx> act_functor;              \
+    act_gate = act_functor(act_gate_str);                                 \
+    act_state = act_functor(act_state_str);                               \
+    cross = math::vec_cross<T, platform::jit::avx>;                       \
+  } else {                                                                \
+    math::VecActivations<T, platform::jit::isa_any> act_functor;          \
+    act_gate = act_functor(act_gate_str);                                 \
+    act_state = act_functor(act_state_str);                               \
+    cross = math::vec_cross<T, platform::jit::isa_any>;                   \
+  }
+
+#define INIT_BASE_INPUT_OUTPUT                        \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
+  auto* wx = ctx.Input<Tensor>("WeightX");            \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
  bool is_reverse = ctx.Attr<bool>("is_reverse");

+#define INIT_BASE_SIZES                  \
+  auto x_dims = x->dims();   /* T x M*/  \
+  auto wh_dims = wh->dims(); /* D x 3D*/ \
+  const int total_T = x_dims[0];         \
+  const int M = x_dims[1];               \
+  const int D = wh_dims[0];              \
+  const int D3 = wh_dims[1];             \
+  const int D2 = D * 2;
+  // Sequence mode: one FC over all of X, then a step loop per sequence on XX.
+  void SeqCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* x = ctx.Input<LoDTensor>("X");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+    // N sequences; x_lod[0] holds their row offsets into X.
+    auto x_lod = x->lod();
+    const int N = x_lod[0].size() - 1;
+    const T* x_data = x->data<T>();
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    const T* wx_data = wx->data<T>();
+    const T* wh_data = wh->data<T>();
+    const T* wh_state_data = wh_data + D * D2;
    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
-    batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
-    batch_hidden->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
+    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
+    // FCCompute fills XX (total_T x D3) from X, WeightX and the optional Bias.
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
+                                      xx_data,
+                                      bias ? bias->data<T>() : nullptr);
+    // Per-step strides; reverse mode starts at the last row and walks back.
+    int xx_offset = D3;
+    int gate_offset = D;
+    if (is_reverse) {
+      const int offset = (total_T - 1) * D;
+      xx_data = xx_data + offset * 3;
+      hidden_out_data = hidden_out_data + offset;
+      xx_offset = -D3;
+      gate_offset = -D;
+    }
+    auto move_step = [&]() {
+      xx_data = xx_data + xx_offset;
+      hidden_out_data = hidden_out_data + gate_offset;
+    };
+    for (int i = 0; i < N; ++i) {
+      int bid = is_reverse ? N - 1 - i : i;
+      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
+      const T* prev_hidden_data = nullptr;
+      int tstart = 0;
+      if (h0_data) {
+        prev_hidden_data = h0_data + bid * D;
+      } else {
+        // W: {W_update, W_reset; W_state}
+        // update gate
+        act_gate(D, xx_data, xx_data);
+        // state gate
+        act_state(D, xx_data + D2, xx_data + D2);
+        // out = zt * ht~ (no H0, so the ht_1 terms are skipped)
+        blas.VMUL(D, xx_data, xx_data + D2, hidden_out_data);
+        // save prev
+        prev_hidden_data = hidden_out_data;
+        tstart = 1;
+        move_step();
+      }
+      for (int step = tstart; step < seq_len; ++step) {
+        // gemm prev * (Wu + Wr)
+        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast<T>(1),
+                  prev_hidden_data, D, wh_data, D2, static_cast<T>(1), xx_data,
+                  D3);
+        act_gate(D2, xx_data, xx_data);
+        // rt * ht_1, staged in hidden_out as the input of the state gemm
+        blas.VMUL(D, prev_hidden_data, xx_data + D, hidden_out_data);
+
+        // gemm rt * Ws
+        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast<T>(1),
+                  hidden_out_data, D, wh_state_data, D, static_cast<T>(1),
+                  xx_data + D2, D3);
+        act_state(D, xx_data + D2, xx_data + D2);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, xx_data, xx_data + D2, prev_hidden_data, hidden_out_data);
+        // save prev
+        prev_hidden_data = hidden_out_data;
+        move_step();
+      }
+    }
+  }
+  // Batch mode: batches the sequences, then advances them step by step.
+  void BatchCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* x = ctx.Input<LoDTensor>("X");
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    if (x->lod()[0].size() == 2) {
+      // XX is inferred as T x min(M, D3) here, but SeqCompute writes T x D3.
+      xx->Resize({total_T, D3});
+      SeqCompute(ctx);
+      return;
+    }
+    INIT_VEC_FUNC
+
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");

    const T* x_data = x->data<T>();
    const T* wx_data = wx->data<T>();
    const T* wh_data = wh->data<T>();
-    auto x_dims = x->dims();
-    auto wx_dims = wx->dims();
+    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
+    T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
+    T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
+    hidden_out->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    if (x_dims[1] > wx_dims[1]) {
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        x_data, wx_data, xx_data,
-                                        bias ? bias->data<T>() : NULL);
-      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    if (M > D3) {
+      math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, x_data, wx_data,
+                                        xx_data,
+                                        bias ? bias->data<T>() : nullptr);
+      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
    } else {
      to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_gate->set_lod(xx->lod());
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        xx_data, wx_data, batched_gate_data,
-                                        bias ? bias->data<T>() : NULL);
+      batched_input->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, total_T, D3, M, xx_data, wx_data,
+                                        batched_input_data,
+                                        bias ? bias->data<T>() : nullptr);
    }

-    int frame_size = static_cast<int>(wx_dims[1] / 3);
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(wh_data);
-    gru_value.state_weight =
-        const_cast<T*>(wh_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batched_gate->lod()[2]);
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});

+    int tstart = 0;
+    T* prev_hidden_data = nullptr;
    if (h0) {
-      ReorderInitState<DeviceContext, T>(
-          ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
-          true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
-    } else {
-      gru_value.prev_out_value = nullptr;
+      // copy h0 rows into ReorderedH0 following the batched sequence order
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
+      const T* h0_data = h0->data<T>();
+      prev_hidden_data = reordered_h0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
      }
-    auto batch_starts = batched_gate->lod()[0];
-    size_t seq_len = batch_starts.size() - 1;
-    auto active_node =
-        math::detail::GetActivationType(ctx.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-
-#ifdef PADDLE_WITH_MKLML
-    // use MKL packed to speedup GEMM
-    if (FLAGS_paddle_num_threads >= 4) {
-      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                       frame_size * 2 /*width of weight*/,
-                                       frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_gate);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
-                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
-                     packed_gate);
-      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                        frame_size /*width of weight*/,
-                                        frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_state);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
-                     frame_size, T(1.0), gru_value.state_weight, frame_size,
-                     packed_state);
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
-              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
-              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
+    } else {
+      // no h0: first step needs no gemm, all max_bs rows get out = zt * ht~
+      T* cur_in_data = batched_input_data;
+      T* cur_out_data = batched_out_data;
+      // W: {W_update, W_reset; W_state}
+      for (int i = 0; i < max_bs; ++i) {
+        // update gate
+        act_gate(D, cur_in_data, cur_in_data);
+        // state gate
+        act_state(D, cur_in_data + D2, cur_in_data + D2);
+        // out = a*b
+        blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
+        // add offset
+        cur_in_data += D3;
+        cur_out_data += D;
      }
-
-        math::detail::forward_reset_output(
-            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_gate);
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
-              gru_value.reset_output_value, frame_size, packed_state,
-              frame_size, T(1), gru_value.gate_value + frame_size * 2,
-              frame_size * 3);
+      tstart = 1;
+      prev_hidden_data = batched_out_data;
    }
-
-        math::detail::forward_final_output(
-            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
-
-        gru_value.prev_out_value = gru_value.output_value;
+    // Then run the remaining steps; each one covers cur_bs batched rows
+    const T* wh_state_data = wh_data + D * D2;
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    batched_input_data = batched_input_data + tstart * max_bs * D3;
+    batched_out_data = batched_out_data + tstart * max_bs * D;
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      // gemm prev * (Wu + Wr)
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
+                prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
+                batched_input_data, D3);
+      T* cur_batched_data = batched_input_data;
+      T* cur_out_data = batched_out_data;
+      T* cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        act_gate(D2, cur_batched_data, cur_batched_data);
+        // rt = rt*ht_1 inplace result
+        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D, cur_out_data);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
+        cur_out_data += D;
      }

-      blas.GEMM_FREE(packed_gate);
-      blas.GEMM_FREE(packed_state);
-    } else {
-#endif
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        math::GRUUnitFunctor<DeviceContext, T>::compute(
-            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate);
-
-        gru_value.prev_out_value = gru_value.output_value;
+      cur_batched_data = batched_input_data;
+      cur_out_data = batched_out_data;
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
+                cur_out_data, D, wh_state_data, D, static_cast<T>(1),
+                cur_batched_data + D2, D3);
+
+      cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        // ht~ = act_state(...)
+        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
+        // out = zt*ht~ + (1-zt)*ht_1
+        cross(D, cur_batched_data, cur_batched_data + D2, cur_prev_hidden_data,
+              cur_out_data);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
+        cur_out_data += D;
      }
-#ifdef PADDLE_WITH_MKLML
+      prev_hidden_data = batched_out_data;
+      batched_out_data = cur_out_data;
+      batched_input_data = cur_batched_data;
    }
-#endif
+
    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batched_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden_out);
+    batched_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_out, hidden_out);
  }
+#undef INIT_VEC_FUNC
+#undef INIT_BASE_SIZES
+#undef INIT_BASE_INPUT_OUTPUT
 };

 }  // namespace operators
@@ -327,6 +428,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OP_CPU_KERNEL(
-    fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
+                       ops::FusionGRUKernel<double>);
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -16,14 +16,10 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/lstm_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 #include "paddle/fluid/platform/cpu_info.h"

-DEFINE_bool(seq_mode, true, "Use sequence mode");
-
 namespace paddle {
 namespace operators {

@@ -42,10 +38,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Output(Hidden) of LSTM should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                 "Output(Cell) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
-                 "Output(BatchedGate) of LSTM should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
-                 "Output(BatchedGate) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                 "Output(BatchedInput) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
+                 "Output(BatchedHidden) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
+                 "Output(BatchedCell) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                 "Output(ReorderedH0) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
+                 "Output(ReorderedC0) of LSTM should not be null.");

  auto x_dims = ctx->GetInputDim("X");
  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
@@ -87,23 +89,24 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
  PADDLE_ENFORCE_EQ(b_dims[0], 1,
                    "The first dimension of Input(Bias) should be 1.");

-  PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
-                 "Do not support peephole yet.");
-  PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+  auto use_peepholes = ctx->Attrs().Get<bool>("use_peepholes");
+  PADDLE_ENFORCE_EQ(b_dims[1], (use_peepholes ? 7 : 4) * frame_size,
                    "The second dimension of Input(Bias) should be "
-                    "4 * %d if disable peepholes connection",
-                    frame_size);
+                    "7 * %d if enable peepholes connection or "
+                    "4 * %d if disable peepholes",
+                    frame_size, frame_size);

  framework::DDim out_dims({x_dims[0], frame_size});
  ctx->SetOutputDim("Hidden", out_dims);
  ctx->SetOutputDim("Cell", out_dims);
-  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchCellPreAct", out_dims);
+  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+  ctx->SetOutputDim("BatchedHidden", out_dims);
+  ctx->SetOutputDim("BatchedCell", out_dims);
  ctx->ShareLoD("X", "Hidden");
  ctx->ShareLoD("X", "Cell");

  int xx_width;
-  if (FLAGS_seq_mode) {
+  if (ctx->Attrs().Get<bool>("use_seq")) {
    xx_width = wx_dims[1];
  } else {
    xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
@@ -169,9 +172,11 @@ void FusionLSTMOpMaker::Make() {
            " where T is the total time steps in this mini-batch,"
            " D is the hidden size, M is the dim size of x input.")
      .AsIntermediate();
-  AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
-  AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
-      .AsIntermediate();
+  AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
+  AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
  AddAttr<bool>("use_peepholes",
                "(bool, defalut: True) "
                "whether to enable diagonal/peephole connections.")
@@ -180,6 +185,10 @@ void FusionLSTMOpMaker::Make() {
                "(bool, defalut: False) "
                "whether to compute reversed LSTM.")
      .SetDefault(false);
+  AddAttr<bool>("use_seq",
+                "(bool, default: True) "
+                "whether to use seq mode to compute.")
+      .SetDefault(true);
  AddAttr<std::string>("gate_activation",
                       "(string, default: sigmoid)"
                       "The activation for input gate, forget gate and output "
@@ -203,70 +212,76 @@ This operator fuse the X into LSTM, more details can refer to LSTM op.
 )DOC");
 }

-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  // TODO(TJ): check mem copy perf
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
 template <typename T>
 class FuisonLSTMKernel : public framework::OpKernel<T> {
 public:
-  void SeqCompute(const framework::ExecutionContext& ctx) const {
-    using DeviceContext = paddle::platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* h0 = ctx.Input<Tensor>("H0");
-    auto* c0 = ctx.Input<Tensor>("C0");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+#define INIT_VEC_FUNC                                                          \
+  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
+  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
+  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
+  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
+    math::VecActivations<T, platform::jit::avx> act_functor;                   \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  } else {                                                                     \
+    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  }
+
+#define INIT_BASE_INPUT_OUTPUT                          \
+  auto* x = ctx.Input<LoDTensor>("X");                  \
+  auto* h0 = ctx.Input<Tensor>("H0");                   \
+  auto* c0 = ctx.Input<Tensor>("C0");                   \
+  auto* wx = ctx.Input<Tensor>("WeightX");              \
+  auto* wh = ctx.Input<Tensor>("WeightH");              \
+  auto* bias = ctx.Input<Tensor>("Bias");               \
+  auto* xx = ctx.Output<LoDTensor>("XX");               \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden");   \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");       \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes"); \
  bool is_reverse = ctx.Attr<bool>("is_reverse");

-    std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
-    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
-    auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
-    auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::jit::MayIUse(platform::jit::avx)) {
-      math::VecActivations<T, platform::jit::avx> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    } else {
-      math::VecActivations<T, platform::jit::isa_any> act_functor;
-      act_gate = act_functor(act_gate_str);
-      act_cell = act_functor(act_cell_str);
-      act_cand = act_functor(act_cand_str);
-    }
+#define INIT_BASE_SIZES                  \
+  auto x_dims = x->dims();   /* T x M*/  \
+  auto wh_dims = wh->dims(); /* D x 4D*/ \
+  const int M = x_dims[1];               \
+  const int D = wh_dims[0];              \
+  const int D2 = D * 2;                  \
+  const int D3 = D * 3;                  \
+  const int D4 = wh_dims[1];
+
+  void SeqCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC

    auto x_lod = x->lod();
-    auto x_dims = x->dims();    // T x M
-    auto wh_dims = wh->dims();  // D x 4D
    const int total_T = x_dims[0];
    const int N = x_lod[0].size() - 1;  // batch size
-    const int M = x_dims[1];            // x frame size
-    const int D = wh_dims[0];
-    const int D2 = D * 2;
-    const int D3 = D * 3;
-    const int D4 = wh_dims[1];

    const T* x_data = x->data<T>();
-    const T* h0_data = h0 ? h0->data<T>() : NULL;
-    const T* c0_data = c0 ? c0->data<T>() : NULL;
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    const T* c0_data = c0 ? c0->data<T>() : nullptr;
+    const T* bias_data = bias->data<T>();
+    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
    const T* wx_data = wx->data<T>();
    const T* wh_data = wh->data<T>();
+
    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
    T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
    T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());

+    // use local variable
+    framework::DDim check_dims({3, D});
+    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
+    auto checked_cell_data =
+        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());
+
    auto blas = math::GetBlas<DeviceContext, T>(ctx);
    math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
                                      xx_data, bias->data<T>());
@@ -290,199 +305,319 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
    for (int i = 0; i < N; ++i) {
      int bid = is_reverse ? N - 1 - i : i;
      int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
-      const T* prev_cell_data = NULL;
-      const T* prev_hidden_data = NULL;
+      const T* prev_c_data = nullptr;
+      const T* prev_h_data = nullptr;
+
      int tstart = 0;
      if (h0_data) {
-        prev_hidden_data = h0_data + bid * D;
-        prev_cell_data = c0_data + bid * D;
+        prev_h_data = h0_data + bid * D;
+        prev_c_data = c0_data + bid * D;
      } else {
-        // W_ch, W_ih, W_fh, W_oh
-        act_gate(D3, xx_data + D, xx_data + D);
+        // If step == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros. Then W_h * H_t-1 can be skipped
+
+        // ~C_t
        act_cand(D, xx_data, xx_data);
-        // cell out= input*tilde
+        if (use_peepholes) {
+          // I_t, F_t
+          act_gate(D2, xx_data + D, xx_data + D);
+        } else {
+          // I_t, F_t, O_t
+          act_gate(D3, xx_data + D, xx_data + D);
+        }
+        // C_t = I_t * ~C_t
        blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
+
+        if (use_peepholes) {
+          // + W_oc * C_t for peephole connection
+          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
+          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
+          // O_t
+          act_gate(D, xx_data + D3, xx_data + D3);
+        }
+
        // hidden out= act_state(cellout) * outgate
        act_cell(D, cell_out_data, xx_data + D2);
+        // H_t = O_t * act_state(C_t)
        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);

        // prev
-        prev_hidden_data = hidden_out_data;
-        prev_cell_data = cell_out_data;
-        tstart = 1;
+        prev_h_data = hidden_out_data;
+        prev_c_data = cell_out_data;

+        tstart = 1;
        move_step();
      }
+
      for (int step = tstart; step < seq_len; ++step) {
+        // + W_h * H_t-1
        blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
-                  prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
-                  D4);
+                  prev_h_data, D, wh_data, D4, static_cast<T>(1), xx_data, D4);

-        // W_ch, W_ih, W_fh, W_oh
-        act_gate(D3, xx_data + D, xx_data + D);
+        // ~C_t
        act_cand(D, xx_data, xx_data);

-        // a = forget * prev_cell
-        blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
+        if (use_peepholes) {
+          // + W_ic|W_fc * C_t-1 for peephole connection
+          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
+          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
+          blas.VADD(D2, xx_data + D, checked_cell_data, xx_data + D);
+          // I_t, F_t
+          act_gate(D2, xx_data + D, xx_data + D);
+        } else {
+          // I_t, F_t, O_t
+          act_gate(D3, xx_data + D, xx_data + D);
+        }

-        // b = input * tilde
+        // F_t * C_t-1
+        blas.VMUL(D, xx_data + D2, prev_c_data, xx_data + D2);
+        // I_t * ~C_t
        blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
-
-        // cell out= a+b
+        // C_t = F_t * C_t-1 + I_t * ~C_t
        blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);

+        if (use_peepholes) {
+          // + W_oc * C_t for peephole connection
+          blas.VMUL(D, wc_data + D2, cell_out_data, checked_cell_data + D2);
+          blas.VADD(D, xx_data + D3, checked_cell_data + D2, xx_data + D3);
+          // O_t
+          act_gate(D, xx_data + D3, xx_data + D3);
+        }
+
        // hidden out= act_state(cellout) * outgate
        act_cell(D, cell_out_data, xx_data + D2);
+        // H_t = O_t * act_state(C_t)
        blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);

        // prev
-        prev_hidden_data = hidden_out_data;
-        prev_cell_data = cell_out_data;
+        prev_h_data = hidden_out_data;
+        prev_c_data = cell_out_data;

        move_step();
-      }
-    }
+      }  // for each step in batch
+    }    // for each batch
  }

  void BatchCompute(const framework::ExecutionContext& ctx) const {
    using DeviceContext = platform::CPUDeviceContext;
-    auto* x = ctx.Input<LoDTensor>("X");
-    auto* wx = ctx.Input<Tensor>("WeightX");
-    auto* wh = ctx.Input<Tensor>("WeightH");
-    auto* bias = ctx.Input<Tensor>("Bias");
-    auto* hidden_t0 = ctx.Input<Tensor>("H0");
-    auto* cell_t0 = ctx.Input<Tensor>("C0");
-
-    auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
-    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    auto* cell_out = ctx.Output<LoDTensor>("Cell");
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    INIT_BASE_INPUT_OUTPUT
+    if (x->lod()[0].size() == 2) {  // batch size == 1
+      SeqCompute(ctx);
+      return;
+    }
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC

-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
-    cell_out->mutable_data<T>(ctx.GetPlace());
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
+    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
+    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");

    const T* x_data = x->data<T>();
    const T* wx_data = wx->data<T>();
-    auto x_dims = x->dims();
-    auto wx_dims = wx->dims();
+    const T* wh_data = wh->data<T>();
+    const T* bias_data = bias->data<T>();
+    const T* wc_data = bias_data + D4;  // w_ic, w_fc, w_oc
+    auto place = ctx.GetPlace();
+    T* xx_data = xx->mutable_data<T>(place);
+    T* batched_input_data = batched_input->mutable_data<T>(place);
+    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
+    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
+    hidden_out->mutable_data<T>(place);
+    cell_out->mutable_data<T>(place);
+
+    // use local variable
+    framework::DDim check_dims({3, D});
+    Tensor checked_cell;  // w_ic * Ct-1, w_fc * Ct-1, w_oc * Ct
+    auto checked_cell_data =
+        checked_cell.mutable_data<T>(check_dims, ctx.GetPlace());

    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-    if (x_dims[1] > wx_dims[1]) {
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        x_data, wx_data, xx_data,
-                                        bias->data<T>());
-      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    if (M > D4) {
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, x_data, wx_data,
+                                        xx_data, bias->data<T>());
+      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
    } else {
      to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_gate->set_lod(xx->lod());
-      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        xx_data, wx_data, batched_gate_data,
+      batched_input->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], D4, M, xx_data,
+                                        wx_data, batched_input_data,
                                        bias->data<T>());
    }

-    int frame_size = static_cast<int>(wx_dims[1] / 4);
-    framework::DDim out_dims({x_dims[0], frame_size});
-    math::LstmMetaValue<T> lstm_value;
-    // no peephole
-    lstm_value.check_ig = nullptr;
-    lstm_value.check_fg = nullptr;
-    lstm_value.check_og = nullptr;
-    lstm_value.prev_state_value = nullptr;
-    Tensor ordered_c0;
-
-    framework::Vector<size_t> order(batched_gate->lod()[2]);
-
-    if (cell_t0) {
-      // Since the batch computing for LSTM reorders the input sequence
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
-                                         true);
-      lstm_value.prev_state_value = ordered_c0.data<T>();
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});
+    reordered_c0->Resize({max_bs, D});
+
+    T* prev_batch_h_data = nullptr;
+    T* prev_batch_c_data = nullptr;
+    T* cur_batch_in_data = batched_input_data;
+    T* cur_batch_h_out_data = batched_h_out_data;
+    T* cur_batch_c_out_data = batched_c_out_data;
+
+    auto move_step = [&](int bs) {
+      cur_batch_in_data += bs * D4;
+      cur_batch_c_out_data += bs * D;
+      cur_batch_h_out_data += bs * D;
+    };
+
+    int tstart = 0;
+    if (h0) {
+      // reorder h0, c0
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
+      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
+      const T* h0_data = h0->data<T>();
+      const T* c0_data = c0->data<T>();
+      prev_batch_h_data = reordered_h0_data;
+      prev_batch_c_data = reordered_c0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
+        reordered_c0_data += D;
      }
+    } else {
+      // Compute with no H0/C0
+      T* cur_in_data = cur_batch_in_data;
+      T* cur_c_out_data = cur_batch_c_out_data;
+      T* cur_h_out_data = cur_batch_h_out_data;
+
+      // If step == 0 and there is no initialized hidden state, H0 is all
+      // zeros, so W_h * H_t-1 can be skipped.

-    // Use the local variable as here.
-    LoDTensor batch_hidden, batch_cell;
-    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
-    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
-    batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
-
-    auto batch_starts = batched_gate->lod()[0];
-    size_t max_seq_len = batch_starts.size() - 1;
-    auto gate_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-    auto cell_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("cell_activation"));
-    auto cand_act = math::detail::GetActivationType(
-        ctx.Attr<std::string>("candidate_activation"));
-
-    for (size_t n = 0; n < max_seq_len; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-
-      Tensor gate_t = batched_gate->Slice(bstart, bend);
-      Tensor out_t = batch_hidden.Slice(bstart, bend);
-      Tensor cell_t = batch_cell.Slice(bstart, bend);
-      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
-
-      int cur_batch_size = bend - bstart;
-
-      if (n > 0) {
-        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
-        int pre_h_end = pre_h_start + cur_batch_size;
-        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        // TODO(TJ): use gemm directly
-        blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
-                    &gate_t, static_cast<T>(1.0));
-      } else if (hidden_t0) {
-        // TODO(TJ): move h0 outside for
-        // If n == 0 and there is no initialized hidden state, that is to say
-        // the H0 is zeros, the calculation W_h * H0 will be skiped.
-        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
-
-        // Since the batch computing for LSTM reorders the input sequence
-        // according to their length. The initialized hidden state also needs
-        // to reorder.
-        Tensor ordered_h0;
-        ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
-                                           &ordered_h0, true);
-        // TODO(TJ): use gemm directly
-        blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
-                    static_cast<T>(1.0));
+      for (int i = 0; i < max_bs; ++i) {  // iterate each data in 1st batch
+        // ~C_t
+        act_cand(D, cur_in_data, cur_in_data);
+
+        if (use_peepholes) {
+          // I_t, F_t
+          act_gate(D2, cur_in_data + D, cur_in_data + D);
+        } else {
+          // I_t, F_t, O_t
+          act_gate(D3, cur_in_data + D, cur_in_data + D);
        }

-      lstm_value.gate_value = gate_t.data<T>();
-      lstm_value.output_value = out_t.data<T>();
-      lstm_value.state_value = cell_t.data<T>();
-      lstm_value.state_active_value = cell_pre_act_t.data<T>();
-      math::LstmUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
-          cand_act);
-      lstm_value.prev_state_value = lstm_value.state_value;
+        // C_t = I_t * ~C_t
+        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_c_out_data);
+
+        if (use_peepholes) {
+          // + W_oc * C_t for peephole connection
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
+          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
+                    cur_in_data + D3);
+          // O_t
+          act_gate(D, cur_in_data + D3, cur_in_data + D3);
        }

-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden.set_lod(batched_gate->lod());
-    // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(dev_ctx, batch_hidden, hidden_out);
+        // hidden out= act_state(cellout) * outgate
+        act_cell(D, cur_c_out_data, cur_in_data + D2);
+        // H_t = O_t * act_state(C_t)
+        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
+
+        // move to next data in the same batch
+        cur_in_data += D4;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
+      }
+
+      // move to data for next timestep
+      prev_batch_h_data = cur_batch_h_out_data;
+      prev_batch_c_data = cur_batch_c_out_data;
+      move_step(max_bs);
+      tstart = 1;
+    }
+
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      // + W_h * H_t-1
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D4, D, static_cast<T>(1),
+                prev_batch_h_data, D, wh_data, D4, static_cast<T>(1),
+                cur_batch_in_data, D4);
+
+      T* cur_in_data = cur_batch_in_data;
+      T* cur_c_out_data = cur_batch_c_out_data;
+      T* cur_h_out_data = cur_batch_h_out_data;
+      T* prev_c_data = prev_batch_c_data;  // NULL if no C0 in step0
+      T* prev_h_data = prev_batch_h_data;  // NULL if no H0 in step0
+      auto next_data_in_batch = [&]() {
+        cur_in_data += D4;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
+        prev_c_data = prev_c_data ? prev_c_data + D : nullptr;
+        prev_h_data = prev_h_data ? prev_h_data + D : nullptr;
+      };
+
+      for (int i = 0; i < cur_bs; ++i) {  // iterate each data in same batch
+        // ~C_t
+        act_cand(D, cur_in_data, cur_in_data);
+
+        if (use_peepholes) {
+          // + W_ic|W_fc * C_t-1 for peephole connection
+          blas.VMUL(D, wc_data, prev_c_data, checked_cell_data);
+          blas.VMUL(D, wc_data + D, prev_c_data, checked_cell_data + D);
+          blas.VADD(D2, cur_in_data + D, checked_cell_data, cur_in_data + D);
+          // I_t, F_t
+          act_gate(D2, cur_in_data + D, cur_in_data + D);
+        } else {
+          // I_t, F_t, O_t
+          act_gate(D3, cur_in_data + D, cur_in_data + D);
+        }

-    batch_cell.set_lod(batched_gate->lod());
-    // restore the output cell state in LoDTensor from the batch cell
-    to_seq(dev_ctx, batch_cell, cell_out);
+        // F_t * C_t-1
+        blas.VMUL(D, cur_in_data + D2, prev_c_data, cur_in_data + D2);
+        // I_t * ~C_t
+        blas.VMUL(D, cur_in_data, cur_in_data + D, cur_in_data + D);
+        // C_t = F_t * C_t-1 + I_t * ~C_t
+        blas.VADD(D, cur_in_data + D, cur_in_data + D2, cur_c_out_data);
+
+        if (use_peepholes) {
+          // + W_oc * C_t for peephole connection
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, checked_cell_data + D2);
+          blas.VADD(D, cur_in_data + D3, checked_cell_data + D2,
+                    cur_in_data + D3);
+          // O_t
+          act_gate(D, cur_in_data + D3, cur_in_data + D3);
        }
+
+        // hidden out= act_state(cellout) * outgate
+        act_cell(D, cur_c_out_data, cur_in_data + D2);
+        // H_t = O_t * act_state(C_t)
+        blas.VMUL(D, cur_in_data + D2, cur_in_data + D3, cur_h_out_data);
+
+        // move to next data in same batch
+        next_data_in_batch();
+      }
+      // move to data for next timestep
+      prev_batch_h_data = cur_batch_h_out_data;
+      prev_batch_c_data = cur_batch_c_out_data;
+      move_step(cur_bs);
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batched_h_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_h_out, hidden_out);
+    batched_c_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_c_out, cell_out);
+  }
+
  void Compute(const framework::ExecutionContext& ctx) const override {
-    if (FLAGS_seq_mode) {
+    if (ctx.Attr<bool>("use_seq")) {
      SeqCompute(ctx);
    } else {
      BatchCompute(ctx);
    }
  }
+#undef INIT_BASE_SIZES
+#undef INIT_BASE_INPUT_OUTPUT
+#undef INIT_VEC_FUNC
 };

 }  // namespace operators

--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -92,12 +92,12 @@ class GRUUnitKernel : public framework::OpKernel<T> {
              gate_data, frame_size * 3);

    // calculate activited gate
-    Eigen::array<int, 2> extents({{batch_size, frame_size}});
-    Eigen::array<int, 2> u_offsets({{0, 0}});
+    Eigen::array<int, 2> extents{{batch_size, frame_size}};
+    Eigen::array<int, 2> u_offsets{{0, 0}};
    ActCompute(context.Attr<int>("gate_activation"), place,
               g.slice(u_offsets, extents), g.slice(u_offsets, extents));
    auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    Eigen::array<int, 2> r_offsets{{0, frame_size}};
    ActCompute(context.Attr<int>("gate_activation"), place,
               g.slice(r_offsets, extents), g.slice(r_offsets, extents));
    auto r = g.slice(r_offsets, extents);  // reset gate
@@ -107,7 +107,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
              weight_data + frame_size * frame_size * 2, frame_size, 1,
              gate_data + frame_size * 2, frame_size * 3);

-    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
    ActCompute(context.Attr<int>("activation"), place,
               g.slice(c_offsets, extents), g.slice(c_offsets, extents));
    auto c = g.slice(c_offsets, extents);  // output candidate
@@ -171,12 +171,12 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
    int batch_size = input->dims()[0];
    int frame_size = hidden_prev->dims()[1];

-    Eigen::array<int, 2> extents({{batch_size, frame_size}});
-    Eigen::array<int, 2> u_offsets({{0, 0}});
+    Eigen::array<int, 2> extents{{batch_size, frame_size}};
+    Eigen::array<int, 2> u_offsets{{0, 0}};
    auto u = g.slice(u_offsets, extents);  // update gate
-    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    Eigen::array<int, 2> r_offsets{{0, frame_size}};
    auto r = g.slice(r_offsets, extents);  // reset gate
-    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    Eigen::array<int, 2> c_offsets{{0, frame_size * 2}};
    auto c = g.slice(c_offsets, extents);  // output candidate

    // backward for unactivated update gate

--- a/paddle/fluid/operators/label_smooth_op.h
+++ b/paddle/fluid/operators/label_smooth_op.h
@@ -38,7 +38,8 @@ class LabelSmoothKernel : public framework::OpKernel<T> {
      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
      out.device(dev) =
          static_cast<T>(1 - epsilon) * in +
-          epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
+          static_cast<T>(epsilon) *
+              dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
    } else {
      out.device(dev) = static_cast<T>(1 - epsilon) * in +
                        static_cast<T>(epsilon / label_dim);

--- a/paddle/fluid/operators/layer_norm_op.cu
+++ b/paddle/fluid/operators/layer_norm_op.cu
@@ -67,27 +67,27 @@ template <typename T, int BlockDim>
 __global__ void LayerNormForward(const T *x, const T *scale, const T *bias,
                                 T *y, T *mean, T *var, float epsilon,
                                 int feature_size) {
-  using BlockReduce = cub::BlockReduce<PairForLayerNorm<T>, BlockDim>;
+  using BlockReduce = cub::BlockReduce<PairForLayerNorm<double>, BlockDim>;
  __shared__ typename BlockReduce::TempStorage temp_storage;

  int beg_idx = blockIdx.x * feature_size + threadIdx.x;
  int end_idx = (blockIdx.x + 1) * feature_size;

  // Step 1: Reduce to calculate mean and var
-  T mean_val = static_cast<T>(0);
-  T var_val = static_cast<T>(0);
+  double mean_val = 0;
+  double var_val = 0;
  for (int i = beg_idx; i < end_idx; i += BlockDim) {
    T tmp = x[i];
    mean_val += tmp;
    var_val += (tmp * tmp);
  }
  auto pair = BlockReduce(temp_storage)
-                  .Reduce(PairForLayerNorm<T>(mean_val, var_val),
-                          PairForLayerNormAddFunctor<T>());
+                  .Reduce(PairForLayerNorm<double>(mean_val, var_val),
+                          PairForLayerNormAddFunctor<double>());
  if (threadIdx.x == 0) {
    auto tmp = pair.first_ / feature_size;
-    mean[blockIdx.x] = tmp;
-    var[blockIdx.x] = pair.second_ / feature_size - tmp * tmp;
+    mean[blockIdx.x] = static_cast<T>(tmp);
+    var[blockIdx.x] = static_cast<T>(pair.second_ / feature_size - tmp * tmp);
  }
  __syncthreads();
  mean_val = mean[blockIdx.x];

--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -57,7 +57,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
          memset(output + i * row_width, 0, row_width * sizeof(T));
        } else {
          PADDLE_ENFORCE_LT(ids[i], row_number);
-          PADDLE_ENFORCE_GE(ids[i], 0);
+          PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i);
          memcpy(output + i * row_width, table + ids[i] * row_width,
                 row_width * sizeof(T));
        }

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
+if (NOT WIN32)
 add_subdirectory(detail)
+endif(NOT WIN32)

 function(math_library TARGET)
    # math_library is a function to create math library. 
@@ -38,9 +40,13 @@ math_library(context_project DEPS im2col math_function)
 math_library(cross_entropy)
 math_library(cos_sim_functor)
 math_library(depthwise_conv)
-math_library(gru_compute DEPS activation_functions math_function)
 math_library(im2col)
+
+if (NOT WIN32) # windows do not support avx functions yet.
+math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
+endif (NOT WIN32)
+
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
@@ -51,7 +57,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+if (NOT WIN32)
 math_library(matrix_bit_code)
+endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)


--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -132,6 +132,121 @@ inline void vec_scal<float, platform::jit::avx512_common>(const int n,
  vec_scal<float, platform::jit::avx2>(n, a, x, y);
 }

+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
+                                                    const float* x, float* y) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(a);
+  __m256 tmp;
+#define MOVE_ONE_STEP             \
+  tmp = _mm256_loadu_ps(x + i);   \
+  tmp = _mm256_sub_ps(bias, tmp); \
+  _mm256_storeu_ps(y + i, tmp)
+  for (i = 0; i < end; i += block) {
+    MOVE_ONE_STEP;
+  }
+#undef MOVE_ONE_STEP
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    y[i] = a - x[i];
+  }
+#else
+  vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y);
+#endif
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a,
+                                                     const float* x, float* y) {
+  vec_bias_sub<float, platform::jit::avx>(n, a, x, y);
+}
+
+template <>
+inline void vec_bias_sub<float, platform::jit::avx512_common>(const int n,
+                                                              const float a,
+                                                              const float* x,
+                                                              float* y) {
+  // TODO(TJ): enable me
+  vec_bias_sub<float, platform::jit::avx2>(n, a, x, y);
+}
+
+// out = x*y + (1-x)*z
+template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
+inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
+  for (int i = 0; i < n; ++i) {
+    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
+  }
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
+                                                 const float* y, const float* z,
+                                                 float* out) {
+#ifdef __AVX__
+  constexpr int block = AVX_FLOAT_BLOCK;
+  if (n < block) {
+    vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+    return;
+  }
+  const int rest = n % block;
+  const int end = n - rest;
+  int i = 0;
+  __m256 bias = _mm256_set1_ps(1.f);
+  __m256 tmpx, tmpy, tmpz;
+  for (i = 0; i < end; i += block) {
+    tmpx = _mm256_loadu_ps(x + i);
+    tmpy = _mm256_loadu_ps(y + i);
+    tmpz = _mm256_loadu_ps(z + i);
+    tmpy = _mm256_mul_ps(tmpx, tmpy);
+    tmpx = _mm256_sub_ps(bias, tmpx);
+    tmpz = _mm256_mul_ps(tmpx, tmpz);
+    tmpz = _mm256_add_ps(tmpy, tmpz);
+    _mm256_storeu_ps(out + i, tmpz);
+  }
+  if (rest == 0) {
+    return;
+  }
+  // can not continue move step if src and dst are inplace
+  for (i = n - rest; i < n; ++i) {
+    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
+  }
+#else
+  vec_cross<float, platform::jit::isa_any>(n, x, y, z, out);
+#endif
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x,
+                                                  const float* y,
+                                                  const float* z, float* out) {
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
+template <>
+inline void vec_cross<float, platform::jit::avx512_common>(
+    const int n, const float* x, const float* y, const float* z, float* out) {
+  // TODO(TJ): enable me
+  vec_cross<float, platform::jit::avx>(n, x, y, z, out);
+}
+
 template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any>
 inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {

--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -17,6 +17,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

+#if defined(_WIN32)
+#include <intrin.h>
+#include <windows.h>
+#endif  // _WIN32
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -55,12 +60,38 @@ namespace math {
 *    FindLastSet(x) = 1 + \floor*{\log_{2}x}
 * \f]
 */
+#if !defined(_WIN32)
 inline constexpr size_t FindLastSet(size_t x) {
  return std::is_same<size_t, unsigned int>::value
             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
             : (std::is_same<size_t, unsigned long>::value  // NOLINT
                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
+
+#else
+// Windows doesn't have built-in clz, ctz functions
+template <typename T>
+inline int ctz(const T& value) {
+  DWORD trailing_zero = 0;
+  if (_BitScanForward(&trailing_zero, value)) {
+    return static_cast<int>(trailing_zero);
+  } else {
+    return static_cast<int>(0);
+  }
+}
+
+template <typename T>
+inline int clz(const T& value) {
+  // NOTE(review): _BitScanReverse scans a 32-bit mask; wider T is truncated.
+  DWORD msb_index = 0;  // index of the most significant set bit
+  return _BitScanReverse(&msb_index, value)
+             ? static_cast<int>(sizeof(T) * 8 - 1 - msb_index)
+             : static_cast<int>(sizeof(T) * 8);  // zero input: all bits zero
+}
+
+inline size_t FindLastSet(size_t x) {  // closed by the '}' after #endif
+  return sizeof(size_t) * 8 - clz(x);
+#endif  // !_WIN32
 }

 struct SimpleCode {

--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -16,13 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/macros.h"

 namespace paddle {
 namespace operators {
 namespace math {

-#define FLT_MAX __FLT_MAX__
-
 template <typename DeviceContext, typename T>
 class MaxOutFunctor {
 public:

--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -18,15 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/macros.h"

 namespace paddle {
 namespace operators {
 namespace math {

-#define FLT_MAX \
-  __FLT_MAX__  // TODO(zcd) :It might need to be placed in another file, but I'm
-               // still wondering where to put it.
-
 /*
 * \brief Extracting simple operations from pooling.
 *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"

--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
    // Calculate the start position of each batch.
    // example:  sequences = {s0, s1, s2}
    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //           num_batch = 5,
+    //           max_seqlen = 5,
    //           batchIndex = {b0, b1, b2, b3, b4}
    //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
    //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
    //               where 1 is the second sequence,
    //                     0 is the first sequence,
    //                     2 is the third sequence.
-    // The num_batch represents batch size after rearranging the
+    // The max_seqlen represents the number of batches after rearranging the
    // input LodTensor. It is also the maximum length of input sequence.

    paddle::framework::LoD batch_lods;
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
    batch_lods.emplace_back(std::vector<size_t>{0});

    // batch_lods[0] is the start positions for batch LoDTensor
-    int num_batch = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
+    int max_seqlen = seq_info[0].length;
+    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
    // batch_lods[1] is the raw index in the input LoDTensor
    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
    // batch_lods[2] is the sort order for the input LoDTensor.
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
    size_t* batch_starts = batch_lods[0].data();
    size_t* seq2batch_idx = batch_lods[1].data();
    batch_starts[0] = 0;
-    for (int n = 0; n < num_batch; n++) {
+    for (int n = 0; n < max_seqlen; n++) {
      auto batch_id = static_cast<int>(batch_starts[n]);
      for (size_t i = 0; i < seq_info.size(); ++i) {
        int seq_len = seq_info[i].length;

--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
@@ -38,10 +38,9 @@ class PReluKernel : public framework::OpKernel<T> {
    auto dim = x->dims();
    int index = 0;
    int i = 0;
-    int temp = 0;
    if (mode == "channel") {
+      int temp = numel / (dim[0] * dim[1]);
      for (i = 0; i < numel; i++) {
-        temp = numel / (dim[0] * dim[1]);
        index = (i / temp) % dim[1];
        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
      }

--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -246,6 +246,88 @@ class ReshapeGradKernel {
  }
 };

+// FIXME(zcd): reshape2 adds an intermediate output(XShape) based on reshape,
+// the XShape is used to carry the shape and lod of X which will be used in
+// reshape2_grad, in this way, the framework can reuse the memory of X
+// immediately after the reshape2 op is finished.
+// Considering compatibility issues, we could not fix reshape_op.
+class Reshape2Op : public ReshapeOp {
+ public:
+  Reshape2Op(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : ReshapeOp(type, inputs, outputs, attrs) {}
+
+  // Runs ReshapeOp's shape inference first, then sets the dims of XShape to
+  // X's dims prefixed with a leading 0 and shares X's LoD with XShape.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ReshapeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of ReshapeOp should not be null.");
+    const auto &in_dims = ctx->GetInputDim("X");
+    const int in_rank = in_dims.size();
+    std::vector<int64_t> padded_dims;
+    padded_dims.reserve(in_rank + 1);
+    padded_dims.push_back(0);
+    for (int axis = 0; axis < in_rank; ++axis) {
+      padded_dims.push_back(in_dims[axis]);
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(padded_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+class Reshape2OpMaker : public ReshapeOpMaker {
+ public:
+  // Declares everything ReshapeOpMaker declares, plus the intermediate XShape
+  // output that the reshape2_grad op takes as input.
+  void Make() override {
+    ReshapeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in Reshape2GradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Reshape2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  // Builds the reshape2_grad description: inputs are the forward XShape
+  // output and Out@GRAD, the output is X@GRAD, and the forward attributes are
+  // copied over.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the start so it is released if any setter throws.
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("reshape2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+class Reshape2GradOp : public framework::OperatorWithKernel {
+ public:
+  Reshape2GradOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  // X@GRAD gets the dims of XShape with the leading entry dropped and shares
+  // XShape's LoD.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    const std::string out_grad_name = framework::GradVarName("Out");
+    const std::string x_grad_name = framework::GradVarName("X");
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
+                   "Input(Out@GRAD) shouldn't be null.");
+    const auto packed_dims = ctx->GetInputDim("XShape");
+    ctx->SetOutputDim(
+        x_grad_name,
+        framework::slice_ddim(packed_dims, 1, packed_dims.size()));
+    ctx->ShareLoD("XShape", x_grad_name);
+  }
+
+ protected:
+  // The kernel is chosen from the data type of Out@GRAD and the device
+  // context of the execution.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    return framework::OpKernelType(framework::ToDataType(out_grad->type()),
+                                   ctx.device_context());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
@@ -261,6 +343,17 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                               ops::ReshapeGradKernel, int64_t,
                               ops::ReshapeGradKernel);

+// Register the reshape2 / reshape2_grad operators and their CPU kernel
+// functors for float, double, int and int64_t. The kernel functors are the
+// same ReshapeKernel / ReshapeGradKernel classes passed for each type.
+REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
+                  ops::Reshape2GradMaker);
+REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
+
 #ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
                                ops::ReshapeKernel, int, ops::ReshapeKernel,
@@ -269,4 +362,11 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
                                double, ops::ReshapeGradKernel, int,
                                ops::ReshapeGradKernel, int64_t,
                                ops::ReshapeGradKernel);
+// CUDA kernel functors for reshape2 / reshape2_grad, registered for the same
+// four element types as the CPU registrations.
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
 #endif
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -36,9 +36,13 @@ class RmspropOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
                   "Output(param_out) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
-                   "Output(Momentum_out) of RmspropOp should not be null.");
+                   "Output(MomentOut) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+    if (ctx->Attrs().Get<bool>("centered")) {
+      PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"),
+                     "Output(MeanGradOut) of RmspropOp should not be null.");
+    }

    auto param_dim = ctx->GetInputDim("Param");
    PADDLE_ENFORCE_EQ(
@@ -58,6 +62,9 @@ class RmspropOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("ParamOut", param_dim);
    ctx->SetOutputDim("MomentOut", param_dim);
    ctx->SetOutputDim("MeanSquareOut", param_dim);
+    if (ctx->Attrs().Get<bool>("centered")) {
+      ctx->SetOutputDim("MeanGradOut", param_dim);
+    }
  }
 };

@@ -70,6 +77,10 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("MeanSquare",
             "(Tensor, default Tensor<float>)"
             " The mean square value that gets updated.");
+    AddInput("MeanGrad",
+             "(Tensor, default Tensor<float>)"
+             " The moving average of gradient")
+        .AsDispensable();
    AddInput("LearningRate",
             "(Tensor, default Tensor<float>) "
             "The learning rate should be a tensor of size 1.");
@@ -82,6 +93,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
    AddOutput("MomentOut", "(Tensor) Output updated moment.");
    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
+    AddOutput("MeanGradOut",
+              "(Tensor) Output moving average of gradient updated value.");

    AddAttr<float>("epsilon",
                   "(float, default 1e-10) Constant "
@@ -93,6 +106,8 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(0.9f);
    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
        .SetDefault(0.0f);
+    AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
+        .SetDefault(false);
    AddComment(R"DOC(
 Rmsprop Optimizer. 

@@ -103,6 +118,14 @@ MomentOut = momentum * Moment +
 ParamOut = Param -  MomentOut
 $$

+if centered is true:
+
+mean_grad = decay * mean_grad{t-1} + (1-decay) * gradient
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2 + epsilon)
+param -= mom
+
 The original slides that proposed Rmsprop: Slide 29 of
 http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)


--- a/paddle/fluid/operators/rmsprop_op.h
+++ b/paddle/fluid/operators/rmsprop_op.h
@@ -41,6 +41,7 @@ class RmspropOpKernel : public framework::OpKernel<T> {
    float epsilon = ctx.Attr<float>("epsilon");
    float rho = ctx.Attr<float>("decay");
    float momentum = ctx.Attr<float>("momentum");
+    bool centered = ctx.Attr<bool>("centered");

    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
@@ -53,12 +54,24 @@ class RmspropOpKernel : public framework::OpKernel<T> {
    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    Eigen::DSizes<int, 1> grad_dsize(static_cast<int>(grad->numel()));

    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+    if (centered) {
+      auto mg = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanGrad"));
+      auto* mean_grad_out = ctx.Output<Tensor>("MeanGradOut");
+      mean_grad_out->mutable_data<T>(ctx.GetPlace());
+      auto mg_out = EigenVector<T>::Flatten(*mean_grad_out);
+
+      mg_out.device(place) = rho * mg + (1 - rho) * g;
+      mom_out.device(place) = momentum * mom +
+                              lr.broadcast(grad_dsize) * g /
+                                  (ms_out - mg_out.square() + epsilon).sqrt();
+    } else {
      mom_out.device(place) =
          momentum * mom +
          lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    }
    p_out.device(place) = p - mom_out;
  }
 };

--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -31,7 +31,7 @@ static inline int NumBlocks(const int N) {

 template <typename T>
 __global__ void GPUROIPoolForward(
-    const int nthreads, const T* input_data, const int64_t* input_rois,
+    const int nthreads, const T* input_data, const T* input_rois,
    const float spatial_scale, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
@@ -43,7 +43,7 @@ __global__ void GPUROIPoolForward(
    int c = (i / pooled_width / pooled_height) % channels;
    int n = i / pooled_width / pooled_height / channels;

-    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    const T* offset_input_rois = input_rois + n * kROISize;
    int roi_batch_ind = roi_batch_id_data[n];
    int roi_start_w = round(offset_input_rois[0] * spatial_scale);
    int roi_start_h = round(offset_input_rois[1] * spatial_scale);
@@ -93,7 +93,7 @@ __global__ void GPUROIPoolForward(

 template <typename T>
 __global__ void GPUROIPoolBackward(
-    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int nthreads, const T* input_rois, const T* output_grad,
    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
    const int channels, const int height, const int width,
    const int pooled_height, const int pooled_width, int* roi_batch_id_data,
@@ -174,8 +174,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {

    GPUROIPoolForward<
        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
-        channels, height, width, pooled_height, pooled_width,
+        output_size, in->data<T>(), rois->data<T>(), spatial_scale, channels,
+        height, width, pooled_height, pooled_width,
        roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
        argmax->mutable_data<int64_t>(ctx.GetPlace()));
  }
@@ -228,7 +228,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
      if (output_grad_size > 0) {
        GPUROIPoolBackward<
            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
-            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            output_grad_size, rois->data<T>(), out_grad->data<T>(),
            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
            width, pooled_height, pooled_width,
            roi_batch_id_list_gpu.data<int>(),

--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -72,7 +72,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
    T* output_data = out->mutable_data<T>(ctx.GetPlace());
    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());

-    const int64_t* rois_data = rois->data<int64_t>();
+    const T* rois_data = rois->data<T>();
    for (int n = 0; n < rois_num; ++n) {
      int roi_batch_id = roi_batch_id_data[n];
      int roi_start_w = round(rois_data[0] * spatial_scale);
@@ -171,7 +171,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
        }
      }

-      const int64_t* rois_data = rois->data<int64_t>();
+      const T* rois_data = rois->data<T>();
      const T* out_grad_data = out_grad->data<T>();
      const int64_t* argmax_data = argmax->data<int64_t>();
      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());

--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
@@ -53,7 +53,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
        static_cast<T>(context.Attr<float>("min")),
        static_cast<T>(context.Attr<float>("max")));

-    std::vector<T> ids(batch_size);
+    std::vector<int64_t> ids(batch_size);
    for (int i = 0; i < batch_size; ++i) {
      T r = dist(engine);
      int idx = width - 1;
@@ -63,7 +63,7 @@ class SamplingIdKernel : public framework::OpKernel<T> {
          break;
        }
      }
-      ids[i] = ins_vector[idx];
+      ids[i] = int64_t(idx);
    }

    std::vector<int64_t> out_dim;

--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <stdint.h>
-#include <sys/stat.h>
 #include <fstream>
 #include <numeric>
 #include <sstream>
@@ -23,40 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace operators {

-// TODO(sidgoyal78): These function are needed by other files (save_op), move
-// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
-constexpr char kSEP = '/';
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  if (mkdir(path, 0755)) {
-    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
-  }
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
-
 class SaveCombineOp : public framework::OperatorBase {
 public:
  SaveCombineOp(const std::string &type,

--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <stdint.h>
-#include <sys/stat.h>
 #include <fstream>
 #include <numeric>

@@ -25,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"

 namespace paddle {
 namespace operators {
@@ -33,36 +33,6 @@ namespace operators {
 // to directory specified.
 constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";

-// TODO(yuyang18): If the functions below are needed by other files, move them
-// to paddle::filesystem namespace.
-constexpr char kSEP = '/';
-static bool FileExists(const std::string &filepath) {
-  struct stat buffer;
-  return (stat(filepath.c_str(), &buffer) == 0);
-}
-
-static std::string DirName(const std::string &filepath) {
-  auto pos = filepath.rfind(kSEP);
-  if (pos == std::string::npos) {
-    return "";
-  }
-  return filepath.substr(0, pos);
-}
-
-static void MkDir(const char *path) {
-  if (mkdir(path, 0755)) {
-    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
-  }
-}
-
-static void MkDirRecursively(const char *fullpath) {
-  if (*fullpath == '\0') return;  // empty string
-  if (FileExists(fullpath)) return;
-
-  MkDirRecursively(DirName(fullpath).c_str());
-  MkDir(fullpath);
-}
-
 class SaveOp : public framework::OperatorBase {
 public:
  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,

--- a/paddle/fluid/operators/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_enumerate_op.cc
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/sequence_enumerate_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceEnumerateOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires X to be a [N, 1] input and sets Out to [N, win_size], sharing
+  // X's LoD with Out.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of SequenceEnumerate operator should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of SequenceEnumerate operator should not be null.");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(
+        x_dims.size(), 2UL,
+        "Input(X) of SequenceEnumerate operator's rank should be 2.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[1], 1UL,
+        "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
+
+    const auto win_size = ctx->Attrs().Get<int>("win_size");
+    ctx->SetOutputDim("Out", {x_dims[0], win_size});
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Declares the X input, the Out output, the win_size attribute (validated
+  // to be at least 2) and the pad_value attribute (default 0).
+  void Make() override {
+    // Validator attached to the win_size attribute below.
+    auto require_min_window = [](const int& win_size) {
+      PADDLE_ENFORCE(win_size >= 2,
+                     "The window size should be not less than 2.");
+    };
+
+    AddInput("X",
+             "(2-D LoDTensor with the 2nd dimension equal to 1) "
+             "Input LoDTensor of SequenceEnumerate operator.");
+    AddOutput("Out",
+              "(2-D LoDTensor with the 2nd dimension equal to win_size) "
+              "Output LoDTensor of SequenceEnumerate operator.");
+    AddAttr<int>("win_size", "(int) The enumerate sequence window size.")
+        .AddCustomChecker(require_min_window);
+    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Sequence Enumerate Operator.
+
+Generate a new sequence for the input index sequence, which enumerates all the
+sub-sequences with length `win_size` of the input. 
+The enumerated sequence has the same 1st dimension with variable `input`, and
+the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
+    
+Examples:
+Case 1:
+  Input:
+    X.lod = [[0, 3, 5]]
+    X.data = [[1], [2], [3], [4], [5]]
+    X.dims = [5, 1]
+  Attrs:
+    win_size = 2
+    pad_value = 0
+  Output:
+    Out.lod = [[0, 3, 5]]
+    Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+    Out.dims = [5, 2]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register sequence_enumerate as an operator without a gradient, with CPU
+// kernels for int32_t and int64_t element types.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp,
+                             ops::SequenceEnumerateOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sequence_enumerate,
+    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::SequenceEnumerateKernel<paddle::platform::CPUDeviceContext, int64_t>);
--- a/paddle/fluid/operators/sequence_enumerate_op.cu
+++ b/paddle/fluid/operators/sequence_enumerate_op.cu
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/fluid/operators/sequence_enumerate_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+using LoDTensor = framework::LoDTensor;
+
+// Each thread handles one input row `index` (rows at or past the last LoD
+// offset are skipped). It writes the win_size entries
+// out_data[index * win_size + k]: in_data[index + k] while index + k is below
+// the first LoD offset greater than index, pad_value otherwise.
+template <typename T>
+__global__ void CalcOutPut(const T* in_data, const size_t* in_lod,
+                           const size_t lod_len, const int64_t win_size,
+                           const int64_t pad_value, T* out_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index >= in_lod[lod_len - 1]) {
+    return;
+  }
+
+  // Find the end offset of the LoD interval that contains this row.
+  int seq_end = 0;
+  for (int level = 1; level < lod_len; ++level) {
+    if (index < in_lod[level]) {
+      seq_end = in_lod[level];
+      break;
+    }
+  }
+
+  for (size_t offset = 0; offset < win_size; ++offset) {
+    int src_pos = index + offset;
+    if (src_pos < seq_end) {
+      out_data[index * win_size + offset] = in_data[src_pos];
+    } else {
+      out_data[index * win_size + offset] = pad_value;
+    }
+  }
+}
+
+// CUDA kernel wrapper of sequence_enumerate: validates the input's LoD and
+// launches CalcOutPut with one thread per input element on the device
+// context's stream.
+template <typename T>
+class SequenceEnumerateOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int win_size = context.Attr<int>("win_size");
+    int pad_value = context.Attr<int>("pad_value");
+
+    auto in_dims = in->dims();
+    auto in_lod = in->lod();
+
+    // in_lod[0] is indexed below, so reject inputs that carry no LoD level.
+    PADDLE_ENFORCE(!in_lod.empty(),
+                   "Input(X) of SequenceEnumerate operator should contain "
+                   "LoD information.");
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    /* Generate enumerate sequence set */
+    auto stream = context.cuda_device_context().stream();
+    auto lod0 = in_lod[0];
+    auto in_len = in->numel();
+    auto in_data = in->data<T>();
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+    // Copy LoD to GPU
+    const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace());
+    // Calc output tensor: ceil(in_len / PADDLE_CUDA_NUM_THREADS) blocks.
+    CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                 PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        in_data, dev_in_lod_ptr, lod0.size(), win_size, pad_value, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// CUDA kernels of sequence_enumerate for int32_t and int64_t element types.
+REGISTER_OP_CUDA_KERNEL(
+    sequence_enumerate,
+    paddle::operators::SequenceEnumerateOpCUDAKernel<int32_t>,
+    paddle::operators::SequenceEnumerateOpCUDAKernel<int64_t>);
--- a/paddle/fluid/operators/sequence_enumerate_op.h
+++ b/paddle/fluid/operators/sequence_enumerate_op.h
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using LoDTensor = framework::LoDTensor;
+
+// CPU kernel of sequence_enumerate. For every row idx inside a level-0 LoD
+// interval it writes win_size entries to Out: in_data[idx + k] while idx + k
+// stays inside the interval, pad_value otherwise.
+template <typename DeviceContext, typename T>
+class SequenceEnumerateKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int win_size = context.Attr<int>("win_size");
+    int pad_value = context.Attr<int>("pad_value");
+
+    auto in_dims = in->dims();
+    auto in_lod = in->lod();
+
+    // in_lod[0] is indexed below, so reject inputs that carry no LoD level.
+    PADDLE_ENFORCE(!in_lod.empty(),
+                   "Input(X) of SequenceEnumerate operator should contain "
+                   "LoD information.");
+    PADDLE_ENFORCE_EQ(
+        static_cast<uint64_t>(in_dims[0]), in_lod[0].back(),
+        "The actual input data's size mismatched with LoD information.");
+
+    // Generate enumerate sequence set
+    auto lod0 = in_lod[0];
+    auto in_data = in->data<T>();
+    auto out_data = out->mutable_data<T>(context.GetPlace());
+    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+      for (size_t idx = lod0[i]; idx < lod0[i + 1]; ++idx) {
+        for (int word_idx = 0; word_idx < win_size; ++word_idx) {
+          size_t word_pos = idx + word_idx;
+          // Positions past the end of the current sequence are padded.
+          out_data[win_size * idx + word_idx] =
+              word_pos < lod0[i + 1] ? in_data[word_pos] : pad_value;
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -181,6 +181,113 @@ class SqueezeGradOp : public framework::OperatorBase {
  }
 };

+// FIXME(zcd): squeeze2 adds an intermediate output(XShape) based on squeeze,
+// the XShape is used to carry the shape and lod of X which will be used in
+// squeeze2_grad, in this way, the framework can reuse the memory of X
+// immediately after the squeeze2 op is finished.
+// Considering compatibility issues, we could not fix squeeze_op.
+class Squeeze2OpMaker : public SqueezeOpMaker {
+ public:
+  // Declares everything SqueezeOpMaker declares, plus the intermediate XShape
+  // output that the squeeze2_grad op takes as input.
+  void Make() override {
+    SqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in Squeeze2GradOp.")
+        .AsIntermediate();
+  }
+};
+
+class Squeeze2OpInferShape : public SqueezeOpInferShape {
+ public:
+  // Runs SqueezeOpInferShape first, then sets the dims of XShape to X's dims
+  // prefixed with a leading 0 and shares X's LoD with XShape.
+  void operator()(framework::InferShapeContext *ctx) const override {
+    SqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Squeeze operator should not be null.");
+    const auto &in_dims = ctx->GetInputDim("X");
+    const int in_rank = in_dims.size();
+    std::vector<int64_t> padded_dims;
+    padded_dims.reserve(in_rank + 1);
+    padded_dims.push_back(0);
+    for (int axis = 0; axis < in_rank; ++axis) {
+      padded_dims.push_back(in_dims[axis]);
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(padded_dims));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+class Squeeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  // Computes the squeezed shape from the "axes" attribute and X's dims, then
+  // delegates the actual work to a reshape2 op created with that shape.
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &squeeze_axes = Attr<std::vector<int>>("axes");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto squeezed_dims =
+        Squeeze2OpInferShape::GetOutputShape(squeeze_axes, in_dims);
+
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(squeezed_dims);
+
+    // Invoke Reshape Op
+    const framework::VariableNameMap reshape_inputs = {{"X", {Input("X")}},
+                                                       {"Shape", {}}};
+    const framework::VariableNameMap reshape_outputs = {
+        {"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}};
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", reshape_inputs, reshape_outputs, reshape_attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  // Builds the squeeze2_grad description: inputs are the forward XShape
+  // output and Out@GRAD, the output is X@GRAD, and the forward attributes are
+  // copied over.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    // Own the OpDesc from the start so it is released if any setter throws.
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("squeeze2_grad");
+    grad_op->SetInput("XShape", Output("XShape"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+class Squeeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  // X@GRAD gets the dims of XShape with the leading entry dropped and shares
+  // XShape's LoD.
+  void operator()(framework::InferShapeContext *context) const override {
+    const std::string out_grad_name = framework::GradVarName("Out");
+    const std::string x_grad_name = framework::GradVarName("X");
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(out_grad_name),
+                   "Input(Out@GRAD) shouldn't be null.");
+    const auto packed_dims = context->GetInputDim("XShape");
+    context->SetOutputDim(
+        x_grad_name,
+        framework::slice_ddim(packed_dims, 1, packed_dims.size()));
+    context->ShareLoD("XShape", x_grad_name);
+  }
+};
+
+class Squeeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  // Delegates to a reshape2 op that reshapes Out@GRAD into X@GRAD, using the
+  // dims of XShape without their leading entry as the target shape.
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    const auto x_grad_var = Output(framework::GradVarName("X"));
+    const auto out_grad_var = Input(framework::GradVarName("Out"));
+    const auto xshape_var = Input("XShape");
+
+    const auto packed_dims =
+        scope.FindVar(xshape_var)->Get<framework::LoDTensor>().dims();
+    const auto restored_dims =
+        framework::slice_ddim(packed_dims, 1, packed_dims.size());
+
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(restored_dims);
+
+    const framework::VariableNameMap reshape_inputs = {{"X", {out_grad_var}},
+                                                       {"Shape", {}}};
+    const framework::VariableNameMap reshape_outputs = {
+        {"Out", {x_grad_var}}, {"XShape", {xshape_var}}};
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape2", reshape_inputs, reshape_outputs, reshape_attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -192,3 +299,8 @@ REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                  ops::SqueezeOpInferShape,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
+
+// Register squeeze2 (op, proto maker, shape inference, grad-desc maker) and
+// its gradient op squeeze2_grad (op, shape inference).
+REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
+                  ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker);
+REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
+                  ops::Squeeze2GradInferShape);
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/transpose_op.h"
+#include <string>
 #include <vector>

 namespace paddle {
@@ -24,7 +25,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    auto x_dims = ctx->GetInputDim("X");
@@ -101,7 +102,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext* ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
@@ -113,6 +114,93 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  }
 };

+// FIXME(zcd): transpose2 adds an intermediate output (XShape) on top of
+// transpose. XShape carries the shape and LoD of X, which are needed by
+// transpose2_grad; this lets the framework reuse the memory of X as soon
+// as transpose2_op has finished. For compatibility reasons the original
+// transpose_op could not be changed in place, so a new op is introduced.
+// transpose2: behaves like TransposeOp and additionally produces XShape,
+// whose dims are [0, dims of X...] and whose LoD is shared from X.
+class Transpose2Op : public TransposeOp {
+ public:
+  Transpose2Op(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : TransposeOp(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Let the base op validate X/Out and infer Out first.
+    TransposeOp::InferShape(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) should not be null");
+    const auto input_dims = ctx->GetInputDim("X");
+    const int rank = input_dims.size();
+    // Slot 0 holds 0; the dims of X are stored from slot 1 onwards.
+    std::vector<int64_t> packed(rank + 1);
+    packed[0] = 0;
+    for (int axis = 0; axis < rank; ++axis) {
+      packed[axis + 1] = input_dims[axis];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(packed));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+
+ protected:
+  // The kernel is selected from the data type of X and the current device.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<framework::LoDTensor>("X");
+    return framework::OpKernelType(framework::ToDataType(x->type()),
+                                   ctx.device_context());
+  }
+};
+
+// Proto maker for transpose2: everything declared by TransposeOpMaker plus
+// the XShape output, which is marked as intermediate.
+class Transpose2OpMaker : public TransposeOpMaker {
+ public:
+  void Make() override {
+    TransposeOpMaker::Make();
+    AddOutput("XShape", "(Tensor)The output tensor.").AsIntermediate();
+  }
+};
+
+// Builds the description of transpose2_grad from the forward transpose2 op.
+class Transpose2GradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  // Inputs: forward XShape and Out@GRAD. Output: X@GRAD. Attributes are
+  // copied from the forward op.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> desc(new framework::OpDesc());
+    desc->SetType("transpose2_grad");
+    desc->SetInput("XShape", Output("XShape"));
+    desc->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    desc->SetAttrMap(Attrs());
+    return desc;
+  }
+};
+
+// Gradient op of transpose2. The shape of X@GRAD is taken from XShape, so
+// the forward input X itself is not needed.
+class Transpose2OpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    const auto dx_name = framework::GradVarName("X");
+    if (!ctx->HasOutput(dx_name)) {
+      return;  // X@GRAD was not requested; nothing to infer.
+    }
+    // Slice from index 1 so the leading entry of XShape is dropped.
+    const auto packed = ctx->GetInputDim("XShape");
+    ctx->SetOutputDim(dx_name,
+                      framework::slice_ddim(packed, 1, packed.size()));
+    ctx->ShareLoD("XShape", dx_name);
+  }
+
+ protected:
+  // The kernel is selected from the data type of Out@GRAD and the device.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    return framework::OpKernelType(framework::ToDataType(out_grad->type()),
+                                   ctx.device_context());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

@@ -120,8 +208,20 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
+
 REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
+
+// Register transpose2 and transpose2_grad, then bind them to the same float
+// CPU kernels that transpose / transpose_grad use.
+REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker,
+                  ops::Transpose2GradMaker);
+REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
--- a/paddle/fluid/operators/transpose_op.cu.cc
+++ b/paddle/fluid/operators/transpose_op.cu.cc
@@ -21,3 +21,10 @@ REGISTER_OP_CUDA_KERNEL(
 REGISTER_OP_CUDA_KERNEL(
    transpose_grad,
    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
+
+// CUDA float kernels for transpose2 / transpose2_grad; these are the same
+// kernel templates registered for transpose / transpose_grad.
+REGISTER_OP_CUDA_KERNEL(
+    transpose2,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose2_grad,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -168,6 +168,112 @@ class UnsqueezeGradOp : public framework::OperatorBase {
  }
 };

+// FIXME(zcd): unsqueeze2 adds an intermediate output (XShape) on top of
+// unsqueeze. XShape carries the shape and LoD of X, which are needed by
+// unsqueeze2_grad; this lets the framework reuse the memory of X as soon
+// as unsqueeze2_op has finished. For compatibility reasons the original
+// unsqueeze_op could not be changed in place, so a new op is introduced.
+// Shape inference for unsqueeze2: runs the unsqueeze inference first, then
+// fills XShape with [0, dims of X...] and shares X's LoD with it.
+class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    UnsqueezeOpInferShape::operator()(ctx);
+    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
+                   "Output(XShape) of Unsqueeze operator should not be null.");
+    const auto input_dims = ctx->GetInputDim("X");
+    const int rank = input_dims.size();
+    // Slot 0 holds 0; the dims of X are stored from slot 1 onwards.
+    std::vector<int64_t> packed(rank + 1);
+    packed[0] = 0;
+    for (int axis = 0; axis < rank; ++axis) {
+      packed[axis + 1] = input_dims[axis];
+    }
+    ctx->SetOutputDim("XShape", framework::make_ddim(packed));
+    ctx->ShareLoD("X", /*->*/ "XShape");
+  }
+};
+
+// Proto maker for unsqueeze2: everything declared by UnsqueezeOpMaker plus
+// the XShape output, which is marked as intermediate.
+class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
+ public:
+  void Make() override {
+    UnsqueezeOpMaker::Make();
+    AddOutput("XShape",
+              "XShape is just used to store the shape and lod of X, which will "
+              "be used in UnsqueezeGradOp.")
+        .AsIntermediate();
+  }
+};
+
+// unsqueeze2 forward op. It computes the unsqueezed shape from the "axes"
+// attribute and delegates to a reshape2 op.
+class Unsqueeze2Op : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    const auto &axes = Attr<std::vector<int>>("axes");
+    const auto input_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(
+        Unsqueeze2OpInferShape::GetOutputShape(axes, input_dims));
+
+    // Hand X, Out and XShape to reshape2, which performs the actual work.
+    framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}},
+        reshape_attrs)
+        ->Run(scope, place);
+  }
+};
+
+// Builds the description of unsqueeze2_grad from the forward unsqueeze2 op.
+class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  // Inputs: forward XShape and Out@GRAD. Output: X@GRAD. Attributes are
+  // copied from the forward op.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> desc(new framework::OpDesc());
+    desc->SetType("unsqueeze2_grad");
+    desc->SetInput("XShape", Output("XShape"));
+    desc->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    desc->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    desc->SetAttrMap(Attrs());
+    return desc;
+  }
+};
+
+// Shape inference for unsqueeze2_grad: X@GRAD takes the dims stored in
+// XShape with the first entry sliced off, and shares XShape's LoD.
+class Unsqueeze2GradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("XShape"),
+                   "Input(XShape) shouldn't be null.");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    const auto dx_name = framework::GradVarName("X");
+    const auto packed_dims = context->GetInputDim("XShape");
+    // Slice from index 1 so the leading entry of XShape is dropped.
+    context->SetOutputDim(
+        dx_name, framework::slice_ddim(packed_dims, 1, packed_dims.size()));
+    context->ShareLoD("XShape", dx_name);
+  }
+};
+
+// Gradient of unsqueeze2. The work is delegated to a reshape2 op that
+// reshapes Out@GRAD into X@GRAD using the dims recorded in XShape.
+class Unsqueeze2GradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    const auto grad_x = Output(framework::GradVarName("X"));
+    const auto grad_out = Input(framework::GradVarName("Out"));
+    const auto xshape_var = Input("XShape");
+
+    // Target shape is XShape with its first entry sliced off.
+    const auto packed_dims =
+        scope.FindVar(xshape_var)->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(
+        framework::slice_ddim(packed_dims, 1, packed_dims.size()));
+
+    framework::OpRegistry::CreateOp(
+        "reshape2", {{"X", {grad_out}}, {"Shape", {}}},
+        {{"Out", {grad_x}}, {"XShape", {xshape_var}}}, reshape_attrs)
+        ->Run(scope, place);
+  }
+};
 }  // namespace operators
 }  // namespace paddle

@@ -180,3 +286,8 @@ REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                  ops::UnsqueezeGradInferShape);
+
+// Register unsqueeze2 (op, proto maker, shape inference, grad-desc maker)
+// and its gradient op unsqueeze2_grad (op, shape inference).
+REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
+                  ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker);
+REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
+                  ops::Unsqueeze2GradInferShape);
--- a/paddle/fluid/platform/macros.h
+++ b/paddle/fluid/platform/macros.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <cfloat>

 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
@@ -23,3 +24,7 @@ limitations under the License. */
  classname& operator=(const classname&) = delete; \
  classname& operator=(classname&&) = delete
 #endif
+
+#if defined(__FLT_MAX__)
+#define FLT_MAX __FLT_MAX__
+#endif  // __FLT_MAX__
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -14,24 +14,141 @@

 #pragma once

+#include <cstdio>
 #include <stdexcept>
+
+#include <memory>
 #include <string>

+#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
+#include "glog/logging.h"
+
 #if !defined(_WIN32)
-#include <dlfcn.h>     // for dladdr
-#include <execinfo.h>  // for backtrace
+#define UNUSED __attribute__((unused))
+#include <dlfcn.h>     //  dladdr
+#include <execinfo.h>  // backtrace
+#include <sys/stat.h>
+#include <algorithm>  // std::accumulate
 #else
-#include <Shlwapi.h>
-#include <Windows.h>
+#include <io.h>  // _popen, _pclose
+#include <windows.h>
+#if defined(_WIN32)
+#include <numeric>  // std::accumulate in msvc
+#endif
+// windows version of __attribute__((unused))
+#define UNUSED __pragma(warning(suppress : 4100))

-static void* dlsym(void* handle, const char* symbol_name) {
+#ifndef S_ISDIR  // windows port for sys/stat.h
+#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+#endif  // S_ISDIR
+
+static void *dlsym(void *handle, const char *symbol_name) {
  FARPROC found_symbol;
  found_symbol = GetProcAddress((HMODULE)handle, symbol_name);

  if (found_symbol == NULL) {
    throw std::runtime_error(std::string(symbol_name) + " not found.");
  }
-  return reinterpret_cast<void*>(found_symbol);
+  return reinterpret_cast<void *>(found_symbol);
 }

-#endif
+// Windows stand-in for POSIX dlopen(): loads the library at `filename` with
+// LoadLibrary and returns its module handle as an opaque pointer.
+// `flag` is accepted for signature compatibility and is not used.
+// Throws std::runtime_error when the library cannot be loaded.
+static void *dlopen(const char *filename, int flag) {
+  std::string file_name(filename);
+  // Convert forward slashes to backslashes. The previous
+  // replace(0, size - 1, '/', '\\') call resolved to the
+  // (pos, len, count, ch) overload and overwrote the path with 47
+  // backslashes instead of translating the separators.
+  for (char &ch : file_name) {
+    if (ch == '/') ch = '\\';
+  }
+  HMODULE hModule = LoadLibrary(file_name.c_str());
+  if (!hModule) {
+    throw std::runtime_error(file_name + " not found.");
+  }
+  return reinterpret_cast<void *>(hModule);
+}
+
+#endif  // !_WIN32
+
+// Runs `cmd` through popen (_popen on Windows) and appends everything the
+// command writes to its standard output to `*message`. If the pipe cannot
+// be opened, an error is logged and `*message` is left untouched.
+static void ExecShellCommand(const std::string &cmd, std::string *message) {
+  char buffer[128];
+#if !defined(_WIN32)
+  FILE *raw_pipe = popen(cmd.c_str(), "r");
+#else
+  FILE *raw_pipe = _popen(cmd.c_str(), "r");
+#endif  // _WIN32
+  // Check before handing the pointer to shared_ptr: a shared_ptr with a
+  // custom deleter invokes it even for a null pointer, which would call
+  // pclose(NULL).
+  if (raw_pipe == nullptr) {
+    LOG(ERROR) << "error running command: " << cmd;
+    return;
+  }
+#if !defined(_WIN32)
+  std::shared_ptr<FILE> pipe(raw_pipe, pclose);
+#else
+  std::shared_ptr<FILE> pipe(raw_pipe, _pclose);
+#endif  // _WIN32
+  // Loop on the fgets result rather than feof(): fgets returns null on both
+  // EOF and read errors, so a read error can no longer spin forever.
+  while (fgets(buffer, sizeof(buffer), pipe.get()) != nullptr) {
+    *message += buffer;
+  }
+}
+
+// Returns true only when `path` can be stat'ed AND refers to a directory;
+// an existing regular file yields false.
+static bool PathExists(const std::string &path) {
+#if !defined(_WIN32)
+  struct stat info;
+  const bool found = stat(path.c_str(), &info) != -1;
+#else
+  struct _stat info;
+  const bool found = _stat(path.c_str(), &info) != -1;
+#endif  // !_WIN32
+  // st_mode is only read when the stat call succeeded.
+  return found && S_ISDIR(info.st_mode);
+}
+
+// TODO(yuyang18): If the functions below are needed by other files, move them
+// to paddle::filesystem namespace.
+#if !defined(_WIN32)
+constexpr char kSEP = '/';
+#else
+constexpr char kSEP = '\\';
+#endif  // _WIN32
+
+// Returns true when stat (_stat on Windows) succeeds for `filepath`; the
+// kind of file system entry is not checked.
+static bool FileExists(const std::string &filepath) {
+  const char *raw_path = filepath.c_str();
+#if !defined(_WIN32)
+  struct stat info;
+  const int rc = stat(raw_path, &info);
+#else
+  struct _stat info;
+  const int rc = _stat(raw_path, &info);
+#endif  // !_WIN32
+  return rc == 0;
+}
+
+// Returns the part of `filepath` before its last kSEP separator, or an
+// empty string when the path contains no separator.
+static std::string DirName(const std::string &filepath) {
+  const auto sep_at = filepath.rfind(kSEP);
+  return sep_at == std::string::npos ? std::string()
+                                     : filepath.substr(0, sep_at);
+}
+
+// Creates the single directory `path`; parent directories are not created.
+// A path that already exists is not treated as an error. Any other failure
+// throws std::runtime_error with the message "<path> mkdir failed!".
+static void MkDir(const char *path) {
+  std::string path_error(path);
+  path_error += " mkdir failed!";
+#if !defined(_WIN32)
+  if (mkdir(path, 0755)) {
+    if (errno != EEXIST) {
+      throw std::runtime_error(path_error);
+    }
+  }
+#else
+  // GetLastError() is only meaningful when CreateDirectory reports failure
+  // by returning zero. Checking it unconditionally made a successful
+  // creation throw.
+  if (!CreateDirectory(path, NULL)) {
+    if (GetLastError() != ERROR_ALREADY_EXISTS) {
+      throw std::runtime_error(path_error);
+    }
+  }
+#endif  // !_WIN32
+}
+
+// Creates `fullpath` and any missing ancestors: recurse on the parent
+// (DirName) first, then create this level with MkDir.
+static void MkDirRecursively(const char *fullpath) {
+  // Recursion ends at an empty path or at a path that already exists.
+  const bool nothing_to_do = (*fullpath == '\0') || FileExists(fullpath);
+  if (nothing_to_do) return;
+
+  const std::string parent = DirName(fullpath);
+  MkDirRecursively(parent.c_str());
+  MkDir(fullpath);
+}
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -115,6 +115,7 @@ function cmake_gen() {
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
        -DPY_VERSION=${PY_VERSION:-2.7}
    ========================================
@@ -144,6 +145,7 @@ EOF
        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
        -DPY_VERSION=${PY_VERSION:-2.7}
 }

--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -98,10 +98,9 @@ class Inferencer(object):
            raise ValueError(
                "inputs should be a map of {'input_name': input_var}")

-        with executor.scope_guard(self.scope):
-            results = self.exe.run(self.inference_program,
-                                   feed=inputs,
-                                   fetch_list=[self.predict_var],
+        with self._prog_and_scope_guard():
+            results = self.exe.run(feed=inputs,
+                                   fetch_list=[self.predict_var.name],
                                   return_numpy=return_numpy)

        return results

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -145,26 +145,23 @@ def rpn_target_assign(loc,
    """

    helper = LayerHelper('rpn_target_assign', **locals())
-    # 1. Compute the regression target bboxes
-    target_bbox = box_coder(
-        prior_box=anchor_box,
-        prior_box_var=anchor_var,
-        target_box=gt_box,
-        code_type='encode_center_size',
-        box_normalized=False)
-    # 2. Compute overlaps between the prior boxes and the gt boxes overlaps
+    # Compute overlaps between the prior boxes and the gt boxes overlaps
    iou = iou_similarity(x=gt_box, y=anchor_box)
-    # 3. Assign target label to anchors
-    loc_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    score_index = helper.create_tmp_variable(dtype=anchor_box.dtype)
-    target_label = helper.create_tmp_variable(dtype=anchor_box.dtype)
+    # Assign target label to anchors
+    loc_index = helper.create_tmp_variable(dtype='int32')
+    score_index = helper.create_tmp_variable(dtype='int32')
+    target_label = helper.create_tmp_variable(dtype='int64')
+    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
    helper.append_op(
        type="rpn_target_assign",
-        inputs={'DistMat': iou},
+        inputs={'Anchor': anchor_box,
+                'GtBox': gt_box,
+                'DistMat': iou},
        outputs={
            'LocationIndex': loc_index,
            'ScoreIndex': score_index,
-            'TargetLabel': target_label
+            'TargetLabel': target_label,
+            'TargetBBox': target_bbox,
        },
        attrs={
            'rpn_batch_size_per_im': rpn_batch_size_per_im,
@@ -173,16 +170,16 @@ def rpn_target_assign(loc,
            'fg_fraction': fg_fraction
        })

-    # 4. Reshape and gather the target entry
-    scores = nn.reshape(x=scores, shape=(-1, 2))
-    loc = nn.reshape(x=loc, shape=(-1, 4))
-    target_label = nn.reshape(x=target_label, shape=(-1, 1))
-    target_bbox = nn.reshape(x=target_bbox, shape=(-1, 4))
+    loc_index.stop_gradient = True
+    score_index.stop_gradient = True
+    target_label.stop_gradient = True
+    target_bbox.stop_gradient = True

+    scores = nn.reshape(x=scores, shape=(-1, 1))
+    loc = nn.reshape(x=loc, shape=(-1, 4))
    predicted_scores = nn.gather(scores, score_index)
    predicted_location = nn.gather(loc, loc_index)
-    target_label = nn.gather(target_label, score_index)
-    target_bbox = nn.gather(target_bbox, loc_index)
+
    return predicted_scores, predicted_location, target_label, target_bbox



--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,7 @@ def accuracy(input, label, k=1, correct=None, total=None):
    return acc_out


-def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
+def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
    """
    **Area Under the Curve (AUC) Layer**

@@ -118,16 +118,14 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
    """
    helper = LayerHelper("auc", **locals())
    auc_out = helper.create_tmp_variable(dtype="float64")
+    batch_auc_out = helper.create_tmp_variable(dtype="float64")
    # make tp, tn, fp, fn persistable, so that can accumulate all batches.
-    tp = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    tn = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    fp = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    fn = helper.create_global_variable(
-        persistable=True, dtype='int64', shape=[num_thresholds])
-    for var in [tp, tn, fp, fn]:
+    stat_pos = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+    stat_neg = helper.create_global_variable(
+        persistable=True, dtype='int64', shape=[num_thresholds + 1])
+
+    for var in [stat_pos, stat_neg]:
        helper.set_variable_initializer(
            var, Constant(
                value=0.0, force_cpu=True))
@@ -137,18 +135,15 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
        inputs={
            "Predict": [input],
            "Label": [label],
-            "TP": [tp],
-            "TN": [tn],
-            "FP": [fp],
-            "FN": [fn]
+            "StatPos": [stat_pos],
+            "StatNeg": [stat_neg]
        },
        attrs={"curve": curve,
               "num_thresholds": num_thresholds},
        outputs={
            "AUC": [auc_out],
-            "TPOut": [tp],
-            "TNOut": [tn],
-            "FPOut": [fp],
-            "FNOut": [fn]
+            "BatchAUC": [batch_auc_out],
+            "StatPosOut": [stat_pos],
+            "StatNegOut": [stat_neg]
        })
-    return auc_out, [tp, tn, fp, fn]
+    return auc_out, batch_auc_out, [stat_pos, stat_neg]
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -111,6 +111,7 @@ __all__ = [
    'stack',
    'pad2d',
    'unstack',
+    'sequence_enumerate',
 ]


@@ -3545,11 +3546,6 @@ def topk(input, k, name=None):

            top5_values, top5_indices = layers.topk(input, k=5)
    """
-    shape = input.shape
-    if k < 1 or k >= shape[-1]:
-        raise ValueError("k must be greater than 0 and less than %d." %
-                         (shape[-1]))
-
    helper = LayerHelper("top_k", **locals())
    values = helper.create_tmp_variable(dtype=input.dtype)
    indices = helper.create_tmp_variable(dtype="int64")
@@ -4029,10 +4025,12 @@ def transpose(x, perm, name=None):

    helper = LayerHelper('transpose', **locals())
    out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
    helper.append_op(
-        type='transpose',
+        type='transpose2',
        inputs={'X': [x]},
-        outputs={'Out': [out]},
+        outputs={'Out': [out],
+                 'XShape': [x_shape]},
        attrs={'axis': perm})
    return out

@@ -4524,13 +4522,15 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
                "Each dimension size given in shape must not be negtive "
                "except one unknown dimension.")

-    helper = LayerHelper("reshape", **locals())
+    helper = LayerHelper("reshape2", **locals())
    out = helper.create_tmp_variable(dtype=x.dtype)
+    x_shape = helper.create_tmp_variable(dtype=x.dtype)
    helper.append_op(
-        type="reshape",
+        type="reshape2",
        inputs=inputs,
        attrs={"shape": shape},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})

    return helper.append_activation(out)

@@ -4574,11 +4574,13 @@ def squeeze(input, axes, name=None):
    """
    helper = LayerHelper("squeeze", **locals())
    out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
-        type="squeeze",
+        type="squeeze2",
        inputs={"X": input},
        attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})

    return out

@@ -4609,11 +4611,13 @@ def unsqueeze(input, axes, name=None):
    """
    helper = LayerHelper("unsqueeze", **locals())
    out = helper.create_tmp_variable(dtype=input.dtype)
+    x_shape = helper.create_tmp_variable(dtype=input.dtype)
    helper.append_op(
-        type="unsqueeze",
+        type="unsqueeze2",
        inputs={"X": input},
        attrs={"axes": axes},
-        outputs={"Out": out})
+        outputs={"Out": out,
+                 "XShape": x_shape})

    return out

@@ -5815,14 +5819,61 @@ def flatten(x, axis=1, name=None):
        raise ValueError("The axis should be a int, and in range [0, rank(x)]")

    out = helper.create_tmp_variable(x.dtype)
+    x_shape = helper.create_tmp_variable(x.dtype)
    helper.append_op(
-        type='flatten',
+        type='flatten2',
        inputs={"X": x},
-        outputs={'Out': out},
+        outputs={'Out': out,
+                 'XShape': x_shape},
        attrs={"axis": axis})
    return out


+def sequence_enumerate(input, win_size, pad_value=0, name=None):
+    """
+    Generate a new sequence for the input index sequence, which enumerates all the
+    sub-sequences with length `win_size` of the input.
+    The enumerated sequence has the same 1st dimension with variable `input`, and
+    the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation.
+
+    Examples:
+    Case 1:
+      Input:
+        X.lod = [[0, 3, 5]]
+        X.data = [[1], [2], [3], [4], [5]]
+        X.dims = [5, 1]
+      Attrs:
+        win_size = 2
+        pad_value = 0
+      Output:
+        Out.lod = [[0, 3, 5]]
+        Out.data = [[1, 2], [2, 3], [3, 0], [4, 5], [5, 0]]
+        Out.dims = [5, 2]
+
+    Args:
+        input (Variable): The input variable which is a index sequence.
+        win_size (int): The window size for enumerating all sub-sequences.
+        pad_value (int): The padding value, default 0.
+        name (str|None): Forwarded to LayerHelper together with the other
+            arguments, default None.
+
+    Returns:
+        Variable: The enumerate sequence variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(shape=[30, 1], dtype='int32', lod_level=1)
+            out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
+    """
+    helper = LayerHelper('sequence_enumerate', **locals())
+    # The output is created with stop_gradient=True.
+    out = helper.create_tmp_variable(helper.input_dtype(), stop_gradient=True)
+    helper.append_op(
+        type='sequence_enumerate',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'win_size': win_size,
+               'pad_value': pad_value})
+    # Return the result as the docstring promises; previously the function
+    # fell off the end and returned None.
+    return out
+
+
 def sequence_mask(x, maxlen=None, dtype='int64', name=None):
    """
    **SequenceMask Layer**
@@ -5902,6 +5953,7 @@ def stack(x, axis=0):
    helper.append_op(
        type='stack', inputs={'X': x}, outputs={'Y': out},
        attrs={'axis': axis})
+
    return out



--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -558,8 +558,6 @@ class Auc(MetricBase):
        name: metric name
        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
          'PR' for the Precision-Recall-curve.
-        num_thresholds: The number of thresholds to use when discretizing the roc
-            curve.

    "NOTE: only implement the ROC curve type via Python now."

@@ -574,15 +572,14 @@ class Auc(MetricBase):
                numpy_auc = metric.eval()
    """

-    def __init__(self, name, curve='ROC', num_thresholds=200):
+    def __init__(self, name, curve='ROC', num_thresholds=4095):
        super(Auc, self).__init__(name=name)
        self._curve = curve
        self._num_thresholds = num_thresholds
-        self._epsilon = 1e-6
-        self.tp_list = np.zeros((num_thresholds, ))
-        self.fn_list = np.zeros((num_thresholds, ))
-        self.tn_list = np.zeros((num_thresholds, ))
-        self.fp_list = np.zeros((num_thresholds, ))
+
+        _num_pred_buckets = num_thresholds + 1
+        self._stat_pos = [0] * _num_pred_buckets
+        self._stat_neg = [0] * _num_pred_buckets

    def update(self, preds, labels):
        if not _is_numpy_(labels):
@@ -590,41 +587,32 @@ class Auc(MetricBase):
        if not _is_numpy_(preds):
            raise ValueError("The 'predictions' must be a numpy ndarray.")

-        kepsilon = 1e-7  # to account for floating point imprecisions
-        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
-                      for i in range(self._num_thresholds - 2)]
-        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
-
-        # calculate TP, FN, TN, FP count
-        for idx_thresh, thresh in enumerate(thresholds):
-            tp, fn, tn, fp = 0, 0, 0, 0
        for i, lbl in enumerate(labels):
+            value = preds[i, 1]
+            bin_idx = int(value * self._num_thresholds)
+            assert bin_idx <= self._num_thresholds
            if lbl:
-                    if preds[i, 1] >= thresh:
-                        tp += 1
-                    else:
-                        fn += 1
+                self._stat_pos[bin_idx] += 1.0
            else:
-                    if preds[i, 1] >= thresh:
-                        fp += 1
-                    else:
-                        tn += 1
-            self.tp_list[idx_thresh] += tp
-            self.fn_list[idx_thresh] += fn
-            self.tn_list[idx_thresh] += tn
-            self.fp_list[idx_thresh] += fp
+                self._stat_neg[bin_idx] += 1.0
+
+    @staticmethod
+    def trapezoid_area(x1, x2, y1, y2):
+        """Area of the trapezoid with width |x1 - x2| and parallel sides y1, y2."""
+        width = abs(x1 - x2)
+        return width * (y1 + y2) / 2.0

    def eval(self):
-        epsilon = self._epsilon
-        num_thresholds = self._num_thresholds
-        tpr = (self.tp_list.astype("float32") + epsilon) / (
-            self.tp_list + self.fn_list + epsilon)
-        fpr = self.fp_list.astype("float32") / (
-            self.fp_list + self.tn_list + epsilon)
-        rec = (self.tp_list.astype("float32") + epsilon) / (
-            self.tp_list + self.fp_list + epsilon)
-
-        x = fpr[:num_thresholds - 1] - fpr[1:]
-        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
-        auc_value = np.sum(x * y)
-        return auc_value
+        tot_pos = 0.0
+        tot_neg = 0.0
+        auc = 0.0
+
+        idx = self._num_thresholds
+        while idx >= 0:
+            tot_pos_prev = tot_pos
+            tot_neg_prev = tot_neg
+            tot_pos += self._stat_pos[idx]
+            tot_neg += self._stat_neg[idx]
+            auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos,
+                                       tot_pos_prev)
+            idx -= 1
+
+        return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -897,7 +897,20 @@ class RMSPropOptimizer(Optimizer):

        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2

-        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    if centered is True:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w)
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 +
            \\epsilon}} \\nabla Q_{i}(w)

        w & = w - v(w, t)
@@ -915,6 +928,10 @@ class RMSPropOptimizer(Optimizer):
            avoid division by zero, set 1e-6 by default.
        momentum(float): :math:`\\beta` in equation is the momentum term,
            set 0.0 by default.
+        centered(bool): If True, gradients are normalized by the estimated variance of
+            the gradient; if False, by the uncentered second moment. Setting this to
+            True may help with training, but is slightly more expensive in terms of
+            computation and memory. Defaults to False.

    Raises:
        ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -928,12 +945,14 @@ class RMSPropOptimizer(Optimizer):

    _momentum_acc_str = "momentum"
    _mean_square_acc_str = "mean_square"
+    _mean_grad_acc_str = "mean_grad"

    def __init__(self,
                 learning_rate,
                 rho=0.95,
                 epsilon=1.0e-6,
                 momentum=0.0,
+                 centered=False,
                 **kwargs):
        super(RMSPropOptimizer, self).__init__(
            learning_rate=learning_rate, **kwargs)
@@ -950,6 +969,7 @@ class RMSPropOptimizer(Optimizer):
        self._rho = rho
        self._epsilon = epsilon
        self._momentum = momentum
+        self._centered = centered

    def _create_accumulators(self, block, parameters):
        if not isinstance(block, framework.Block):
@@ -958,6 +978,7 @@ class RMSPropOptimizer(Optimizer):
        for p in parameters:
            self._add_accumulator(self._momentum_acc_str, p)
            self._add_accumulator(self._mean_square_acc_str, p)
+            self._add_accumulator(self._mean_grad_acc_str, p)

    def _append_optimize_op(self, block, param_and_grad):
        if not isinstance(block, framework.Block):
@@ -967,6 +988,8 @@ class RMSPropOptimizer(Optimizer):
                                             param_and_grad[0])
        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
                                                param_and_grad[0])
+        mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str,
+                                              param_and_grad[0])
        rmsprop_op = block.append_op(
            type=self.type,
            inputs={
@@ -974,17 +997,20 @@ class RMSPropOptimizer(Optimizer):
                "Grad": param_and_grad[1],
                "Moment": momentum_acc,
                "MeanSquare": mean_square_acc,
+                "MeanGrad": mean_grad_acc,
                "LearningRate": self._create_param_lr(param_and_grad),
            },
            outputs={
                "ParamOut": param_and_grad[0],
                "MomentOut": momentum_acc,
-                "MeanSquareOut": mean_square_acc
+                "MeanSquareOut": mean_square_acc,
+                "MeanGradOut": mean_grad_acc
            },
            attrs={
                "epsilon": self._epsilon,
                "decay": self._rho,
-                "momentum": self._momentum
+                "momentum": self._momentum,
+                "centered": self._centered
            })

        return rmsprop_op

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -16,7 +16,9 @@ from __future__ import print_function

 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy
+import os
 import cifar10_small_test_set


@@ -89,7 +91,7 @@ def optimizer_func():
    return fluid.optimizer.Adam(learning_rate=0.001)


-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
    BATCH_SIZE = 128
    EPOCH_NUM = 1

@@ -116,7 +118,10 @@ def train(use_cuda, train_program, params_dirname):

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    trainer = fluid.Trainer(
-        train_func=train_program, optimizer_func=optimizer_func, place=place)
+        train_func=train_program,
+        optimizer_func=optimizer_func,
+        place=place,
+        parallel=parallel)

    trainer.train(
        reader=train_reader,
@@ -125,10 +130,13 @@ def train(use_cuda, train_program, params_dirname):
        feed_order=['pixel', 'label'])


-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)

    # The input's dimension of conv should be 4-D or 5-D.
    # Use normilized image pixels as input data, which should be in the range
@@ -139,22 +147,34 @@ def infer(use_cuda, inference_program, params_dirname=None):
    print("infer results: ", results)


-def main(use_cuda):
+def main(use_cuda, parallel):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    save_path = "image_classification_resnet.inference.model"

+    os.environ['CPU_NUM'] = str(4)
    train(
        use_cuda=use_cuda,
        train_program=train_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)

+    # FIXME(zcd): in the inference stage there is only one input
+    # sample, so it is not appropriate to run in parallel.
+    if parallel and use_cuda:
+        return
+
+    os.environ['CPU_NUM'] = str(1)
    infer(
        use_cuda=use_cuda,
        inference_program=inference_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)


 if __name__ == '__main__':
    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            main(use_cuda=use_cuda, parallel=parallel)
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -17,7 +17,9 @@ from __future__ import print_function
 import six
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import numpy
+import os
 import cifar10_small_test_set


@@ -69,7 +71,7 @@ def optimizer_func():
    return fluid.optimizer.Adam(learning_rate=0.001)


-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
    BATCH_SIZE = 128
    train_reader = paddle.batch(
        paddle.reader.shuffle(
@@ -94,7 +96,10 @@ def train(use_cuda, train_program, params_dirname):

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=parallel)

    if six.PY2:
        trainer.train(
@@ -114,10 +119,13 @@ def train(use_cuda, train_program, params_dirname):
            assert ("kid scope" in cpt.get_exception_message(ex))


-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)

    # The input's dimension of conv should be 4-D or 5-D.
    # Use normilized image pixels as input data, which should be in the range
@@ -128,22 +136,31 @@ def infer(use_cuda, inference_program, params_dirname=None):
    print("infer results: ", results)


-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
+def main(use_cuda, parallel):
    save_path = "image_classification_vgg.inference.model"

+    os.environ['CPU_NUM'] = str(4)
    train(
        use_cuda=use_cuda,
        train_program=train_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)

+    # FIXME(zcd): in the inference stage there is only one input
+    # sample, so it is not appropriate to run in parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
    infer(
        use_cuda=use_cuda,
        inference_program=inference_network,
-        params_dirname=save_path)
+        params_dirname=save_path,
+        parallel=parallel)


 if __name__ == '__main__':
    for use_cuda in (False, True):
-        main(use_cuda=use_cuda)
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            main(use_cuda=use_cuda, parallel=parallel)
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -64,14 +64,14 @@ def optimizer_func():
    return fluid.optimizer.Adam(learning_rate=0.001)


-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, parallel, params_dirname):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    trainer = fluid.Trainer(
        train_func=train_program,
        place=place,
        optimizer_func=optimizer_func,
-        parallel=True)
+        parallel=parallel)

    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
@@ -108,11 +108,14 @@ def train(use_cuda, train_program, params_dirname):
        feed_order=['img', 'label'])


-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)

    batch_size = 1
    tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -123,20 +126,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
    print("infer results: ", results[0])


-def main(use_cuda):
+def main(use_cuda, parallel):
    params_dirname = "recognize_digits_conv.inference.model"

    # call train() with is_local argument to run distributed train
+    os.environ['CPU_NUM'] = str(4)
    train(
        use_cuda=use_cuda,
        train_program=train_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
+
+    # FIXME(zcd): in the inference stage there is only one input
+    # sample, so it is not appropriate to run in parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
    infer(
        use_cuda=use_cuda,
        inference_program=inference_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)


 if __name__ == '__main__':
-    # for use_cuda in (False, True):
-    main(use_cuda=core.is_compiled_with_cuda())
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            main(use_cuda=use_cuda, parallel=parallel)
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -16,6 +16,7 @@ from __future__ import print_function

 import argparse
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 import paddle
 import six
 import sys
@@ -51,11 +52,14 @@ def optimizer_func():
    return fluid.optimizer.Adam(learning_rate=0.001)


-def train(use_cuda, train_program, params_dirname):
+def train(use_cuda, train_program, params_dirname, parallel):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    trainer = fluid.Trainer(
-        train_func=train_program, place=place, optimizer_func=optimizer_func)
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=parallel)

    def event_handler(event):
        if isinstance(event, fluid.EndEpochEvent):
@@ -98,11 +102,14 @@ def train(use_cuda, train_program, params_dirname):
            assert ("kid scope" in cpt.get_exception_message(ex))


-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, inference_program, parallel, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    inferencer = fluid.Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
+        infer_func=inference_program,
+        param_path=params_dirname,
+        place=place,
+        parallel=parallel)

    batch_size = 1
    tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -113,20 +120,32 @@ def infer(use_cuda, inference_program, params_dirname=None):
    print("infer results: ", results[0])


-def main(use_cuda):
+def main(use_cuda, parallel):
    params_dirname = "recognize_digits_mlp.inference.model"

    # call train() with is_local argument to run distributed train
+    os.environ['CPU_NUM'] = str(4)
    train(
        use_cuda=use_cuda,
        train_program=train_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)
+
+    # FIXME(zcd): in the inference stage there is only one input
+    # sample, so it is not appropriate to run in parallel.
+    if parallel and use_cuda:
+        return
+    os.environ['CPU_NUM'] = str(1)
    infer(
        use_cuda=use_cuda,
        inference_program=inference_program,
-        params_dirname=params_dirname)
+        params_dirname=params_dirname,
+        parallel=parallel)


 if __name__ == '__main__':
-    # for use_cuda in (False, True):
-    main(use_cuda=False)
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            if use_cuda and not core.is_compiled_with_cuda():
+                continue
+            main(use_cuda=use_cuda, parallel=parallel)
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -281,7 +281,7 @@ class TestRpnTargetAssign(unittest.TestCase):
            gt_box = layers.data(
                name='gt_box', shape=[4], lod_level=1, dtype='float32')

-            predicted_scores, predicted_location, target_label, target_bbox = layers.rpn_target_assign(
+            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
                loc=loc,
                scores=scores,
                anchor_box=anchor_box,
@@ -292,15 +292,13 @@ class TestRpnTargetAssign(unittest.TestCase):
                rpn_positive_overlap=0.7,
                rpn_negative_overlap=0.3)

-            self.assertIsNotNone(predicted_scores)
-            self.assertIsNotNone(predicted_location)
-            self.assertIsNotNone(target_label)
-            self.assertIsNotNone(target_bbox)
-            assert predicted_scores.shape[1] == 2
-            assert predicted_location.shape[1] == 4
-            assert predicted_location.shape[1] == target_bbox.shape[1]
-
-        print(str(program))
+            self.assertIsNotNone(pred_scores)
+            self.assertIsNotNone(pred_loc)
+            self.assertIsNotNone(tgt_lbl)
+            self.assertIsNotNone(tgt_bbox)
+            assert pred_scores.shape[1] == 1
+            assert pred_loc.shape[1] == 4
+            assert pred_loc.shape[1] == tgt_bbox.shape[1]


 class TestGenerateProposals(unittest.TestCase):

--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -36,6 +36,7 @@ import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid import core
 from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.compat as cpt
 from paddle.compat import long_type

 import hashlib
@@ -315,7 +316,8 @@ def pad_batch_data(insts,
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)
-    num_token = reduce(lambda x, y: x + y,
+    num_token = six.moves.reduce(
+        lambda x, y: x + y,
        [len(inst) for inst in insts]) if return_num_token else 0
    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
@@ -328,7 +330,7 @@ def pad_batch_data(insts,
        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
    else:  # position data
        inst_pos = np.array([
-            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            list(range(1, len(inst) + 1)) + [0] * (max_len - len(inst))
            for inst in insts
        ])
        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
@@ -385,10 +387,11 @@ def prepare_batch_input(insts, data_input_names, src_pad_idx, trg_pad_idx,
        return_num_token=True)

    data_input_dict = dict(
+        list(
            zip(data_input_names, [
                src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
                trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
-        ]))
+            ])))
    return data_input_dict, np.asarray([num_token], dtype="float32")


@@ -561,7 +564,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                        np.log(TrainTaskConfig.label_smooth_eps / (
                            ModelHyperParams.trg_vocab_size - 1) + 1e-20))
    init = False
-    for pass_id in xrange(TrainTaskConfig.pass_num):
+    for pass_id in six.moves.xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            if batch_id >= 5:
@@ -587,11 +590,11 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
                    ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                    ModelHyperParams.d_model)
                total_num_token += num_token
-                feed_kv_pairs = data_input_dict.items()
+                feed_kv_pairs = list(data_input_dict.items())
                if TrainTaskConfig.local:
-                    feed_kv_pairs += {
+                    feed_kv_pairs += list({
                        lr_scheduler.learning_rate.name: lr_rate
-                    }.items()
+                    }.items())
                feed_list.append(dict(feed_kv_pairs))

                if not init:
@@ -873,6 +876,7 @@ class DataReader(object):

            f = tarfile.open(fpaths[0], "r")
            for line in f.extractfile(tar_fname):
+                line = cpt.to_text(line)
                fields = line.strip("\n").split(self._field_delimiter)
                if (not self._only_src and len(fields) == 2) or (
                        self._only_src and len(fields) == 1):
@@ -882,8 +886,9 @@ class DataReader(object):
                if not os.path.isfile(fpath):
                    raise IOError("Invalid file: %s" % fpath)

-                with open(fpath, "r") as f:
+                with open(fpath, "rb") as f:
                    for line in f:
+                        line = cpt.to_text(line)
                        fields = line.strip("\n").split(self._field_delimiter)
                        if (not self._only_src and len(fields) == 2) or (
                                self._only_src and len(fields) == 1):
@@ -892,8 +897,9 @@ class DataReader(object):
    @staticmethod
    def load_dict(dict_path, reverse=False):
        word_dict = {}
-        with open(dict_path, "r") as fdict:
+        with open(dict_path, "rb") as fdict:
            for idx, line in enumerate(fdict):
+                line = cpt.to_text(line)
                if reverse:
                    word_dict[idx] = line.strip("\n")
                else:
@@ -1034,7 +1040,7 @@ def multi_head_attention(queries,
        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
-            shape=map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]]))
+            shape=list(map(int, [0, 0, trans_x.shape[2] * trans_x.shape[3]])))

    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -249,7 +249,7 @@ class OpTest(unittest.TestCase):
        outs, _ = self._calc_output(place)
        return outs

-    def _calc_output(self, place, parallel=False):
+    def _calc_output(self, place, parallel=False, no_check_set=None):

        program = Program()
        block = program.global_block()
@@ -273,6 +273,8 @@ class OpTest(unittest.TestCase):
        # if not, fill the fetch_list by the user configured outputs in test.
        if len(fetch_list) == 0:
            for var_name, var in six.iteritems(outputs):
+                if no_check_set is not None and var_name in no_check_set:
+                    continue
                if isinstance(var, list):
                    for v in var:
                        fetch_list.append(v)
@@ -291,11 +293,17 @@ class OpTest(unittest.TestCase):
                            return_numpy=False)
        return outs, fetch_list

-    def check_output_with_place(self, place, atol):
-        outs, fetch_list = self._calc_output(place)
+    def check_output_with_place(self,
+                                place,
+                                atol,
+                                no_check_set=None,
+                                equal_nan=False):
+        outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
            if out_name not in self.outputs:
                continue
+            if no_check_set is not None and out_name in no_check_set:
+                continue

            def find_actual(target_name, fetch_list):
                found = [
@@ -321,7 +329,7 @@ class OpTest(unittest.TestCase):
                        if isinstance(expect, tuple) else expect
                    self.assertTrue(
                        np.allclose(
-                            actual_t, expect_t, atol=atol),
+                            actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                        "Output (" + sub_out_name + ") has diff at " +
                        str(place))
                    if isinstance(expect, tuple):
@@ -337,7 +345,7 @@ class OpTest(unittest.TestCase):
                expect_t = expect[0] if isinstance(expect, tuple) else expect
                self.assertTrue(
                    np.allclose(
-                        actual_t, expect_t, atol=atol),
+                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                    "Output (" + out_name + ") has diff at " + str(place) +
                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
                    str(actual_t))
@@ -360,10 +368,10 @@ class OpTest(unittest.TestCase):
            places.append(core.CUDAPlace(0))
        return places

-    def check_output(self, atol=1e-5):
+    def check_output(self, atol=1e-5, no_check_set=None, equal_nan=False):
        places = self._get_places()
        for place in places:
-            self.check_output_with_place(place, atol)
+            self.check_output_with_place(place, atol, no_check_set, equal_nan)

    def check_output_customized(self, checker):
        places = self._get_places()

--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
@@ -26,18 +26,15 @@ class TestAucOp(OpTest):
        pred = np.random.random((128, 2)).astype("float32")
        labels = np.random.randint(0, 2, (128, 1))
        num_thresholds = 200
-        tp = np.zeros((num_thresholds, )).astype("int64")
-        tn = np.zeros((num_thresholds, )).astype("int64")
-        fp = np.zeros((num_thresholds, )).astype("int64")
-        fn = np.zeros((num_thresholds, )).astype("int64")
+
+        stat_pos = np.zeros((num_thresholds + 1, )).astype("int64")
+        stat_neg = np.zeros((num_thresholds + 1, )).astype("int64")

        self.inputs = {
            'Predict': pred,
            'Label': labels,
-            'TP': tp,
-            'TN': tn,
-            'FP': fp,
-            'FN': fn
+            "StatPos": stat_pos,
+            "StatNeg": stat_neg
        }
        self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}

@@ -47,11 +44,10 @@ class TestAucOp(OpTest):
        python_auc.update(pred, labels)

        self.outputs = {
-            'AUC': python_auc.eval(),
-            'TPOut': python_auc.tp_list,
-            'FNOut': python_auc.fn_list,
-            'TNOut': python_auc.tn_list,
-            'FPOut': python_auc.fp_list
+            'AUC': np.array(python_auc.eval()),
+            'BatchAUC': np.array(python_auc.eval()),
+            'StatPosOut': np.array(python_auc._stat_pos),
+            'StatNegOut': np.array(python_auc._stat_neg)
        }

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -438,7 +438,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
        # 2 optimize for table adam
        # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
        self.assertEqual([op.type for op in pserver1.blocks[2].ops],
-                         ["sum", "adam", "scale", "scale"])
+                         ["sum", "scale", "adam", "scale", "scale"])

        trainer, _ = self.get_trainer()
        self.assertEqual(len(trainer.blocks), 1)

--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -21,28 +21,41 @@ from op_test import OpTest

 class TestFakeQuantizeOp(OpTest):
    def setUp(self):
-        self.op_type = "fake_quantize"
+        self.op_type = "fake_quantize_abs_max"
+        self.attrs = {'bit_length': 8}
+        self.inputs = {'X': np.random.random((124, 240)).astype("float32"), }
+        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / scale * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutScale': np.array(scale).astype("float32"),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeQuantizeRangeAbsMaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_range_abs_max"
        self.attrs = {
-            'bit_length': 8,
-            'quantize_type': 'abs_max',
-            'window_size': 10000
+            'bit_length': int(5),
+            'window_size': int(1),
+            'is_test': False
        }
        self.inputs = {
-            'X': np.random.random((10, 10)).astype("float32"),
-            'InScales': np.zeros(self.attrs['window_size']).astype("float32"),
-            'InCurrentIter': np.zeros(1).astype("float32"),
-            'InMovingScale': np.zeros(1).astype("float32")
-        }
-        self.scale = {
-            'abs_max': np.max(np.abs(self.inputs['X'])).astype("float32")
+            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'Iter': np.zeros(1).astype("int64"),
+            'InScale': np.zeros(1).astype("float32")
        }
+        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+        out_scales = np.zeros(self.attrs['window_size']).astype("float32")
+        out_scales[0] = scale
        self.outputs = {
-            'Out': np.round(self.inputs['X'] / self.scale['abs_max'] * (
+            'Out': np.round(self.inputs['X'] / scale * (
                (1 << (self.attrs['bit_length'] - 1)) - 1)),
-            'OutScales': np.zeros(self.attrs['window_size']).astype("float32"),
-            'OutMovingScale':
-            np.array([self.scale['abs_max']]).astype("float32"),
-            'OutCurrentIter': np.zeros(1).astype("float32")
+            'OutScale': scale,
+            'OutScales': out_scales,
        }

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -22,14 +22,17 @@ from op_test import OpTest

 class TestFlattenOp(OpTest):
    def setUp(self):
-        self.op_type = "flatten"
+        self.op_type = "flatten2"
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.in_shape).astype("float32")
+        }

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
@@ -37,7 +37,7 @@ def fusion_gru(
               h0,
               wh,
               np.zeros(
-                   (1, wh.shape[1]), dtype='float64'),
+                   (1, wh.shape[1]), dtype='float32'),
               is_reverse,
               act_state,
               act_gate)
@@ -62,15 +62,15 @@ class TestFusionGRUOp(OpTest):
        T = sum(self.lod[0])
        N = len(self.lod[0])

-        x = np.random.rand(T, self.M).astype('float64')
-        wx = np.random.rand(self.M, 3 * self.D).astype('float64')
-        wh = np.random.rand(self.D, 3 * self.D).astype('float64')
+        x = np.random.rand(T, self.M).astype('float32')
+        wx = np.random.rand(self.M, 3 * self.D).astype('float32')
+        wh = np.random.rand(self.D, 3 * self.D).astype('float32')
        bias = np.random.rand(
-            1, 3 * self.D).astype('float64') if self.with_bias else np.zeros(
-                (1, 3 * self.D), dtype='float64')
+            1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
+                (1, 3 * self.D), dtype='float32')
        h0 = np.random.rand(
-            N, self.D).astype('float64') if self.with_h0 else np.zeros(
-                (N, self.D), dtype='float64')
+            N, self.D).astype('float32') if self.with_h0 else np.zeros(
+                (N, self.D), dtype='float32')

        _, _, _, hidden = fusion_gru(
            x, self.lod, h0, wx, wh, bias, self.is_reverse,
@@ -93,7 +93,9 @@ class TestFusionGRUOp(OpTest):
        }

    def test_check_output(self):
-        self.check_output(atol=1e-8)
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
+            self.check_output()


 class TestFusionGRUOpNoInitial(TestFusionGRUOp):

--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest):
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'
        self.use_peepholes = False
+        self.use_seq = False
        self.set_conf()

        T = sum(self.lod[0])
@@ -107,6 +108,7 @@ class TestFusionLSTMOp(OpTest):
        }
        self.attrs = {
            'use_peepholes': self.use_peepholes,
+            'use_seq': self.use_seq,
            'is_reverse': self.is_reverse,
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
@@ -114,6 +116,8 @@ class TestFusionLSTMOp(OpTest):
        }

    def test_check_output(self):
+        for use_seq in {True, False}:
+            self.attrs['use_seq'] = use_seq
            self.check_output()


@@ -157,5 +161,68 @@ class TestFusionLSTMOpBS1(TestFusionLSTMOp):
        self.D = 16


+class TestFusionLSTMOpPeepholes(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+
+
+class TestFusionLSTMOpPeepholesInit(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.has_initial_state = True
+
+
+class TestFusionLSTMOpPeepholesReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.is_reverse = True
+
+
+class TestFusionLSTMOpPoopholesBS1(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_peepholes = True
+        self.lod = [[3]]
+        self.D = 16
+
+
+class TestFusionLSTMOpSeqInit(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.has_initial_state = True
+
+
+class TestFusionLSTMOpSeqReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.is_reverse = True
+
+
+class TestFusionLSTMOpSeqInitReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.has_initial_state = True
+        self.is_reverse = True
+
+
+class TestFusionLSTMOpSeqPeepholes(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.use_peepholes = True
+
+
+class TestFusionLSTMOpSeqPeepholesInit(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.use_peepholes = True
+        self.has_initial_state = True
+
+
+class TestFusionLSTMOpSeqPeepholesReverse(TestFusionLSTMOp):
+    def set_conf(self):
+        self.use_seq = True
+        self.use_peepholes = True
+        self.is_reverse = True
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
@@ -177,8 +177,8 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):

    dx = (gt_ctr_x - ex_ctr_x) / ex_w / weights[0]
    dy = (gt_ctr_y - ex_ctr_y) / ex_h / weights[1]
-    dw = (np.log(gt_w / ex_w)) / ex_w / weights[2]
-    dh = (np.log(gt_h / ex_h)) / ex_h / weights[3]
+    dw = (np.log(gt_w / ex_w)) / weights[2]
+    dh = (np.log(gt_h / ex_h)) / weights[3]

    targets = np.vstack([dx, dy, dw, dh]).transpose()
    return targets

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -549,6 +549,13 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(out)
        print(str(program))

+    def test_sequence_enumerate(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="input", shape=[1], dtype='int32', lod_level=1)
+            out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
@@ -16,6 +16,7 @@ from __future__ import print_function

 import unittest
 import numpy as np
+import six
 from op_test import OpTest


@@ -62,17 +63,20 @@ class PReluTest(OpTest):


 # TODO(minqiyang): Resume these test cases after fixing Python3 CI job issues
-#  class TestCase1(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "all"}
+if six.PY2:

-#  class TestCase2(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "channel"}
+    class TestCase1(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "all"}
+
+    class TestCase2(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "channel"}
+
+    class TestCase3(PReluTest):
+        def initTestCase(self):
+            self.attrs = {'mode': "element"}

-#  class TestCase3(PReluTest):
-#  def initTestCase(self):
-#  self.attrs = {'mode': "element"}

 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -22,106 +22,39 @@ from op_test import OpTest

 class TestReshapeOp(OpTest):
    def setUp(self):
-        ori_shape = (2, 25)
-        new_shape = (5, 10)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer2(OpTest):
-    def setUp(self):
-        ori_shape = (2, 2, 6)
-        new_shape = (2, 0, 3, -1)
-        infered_shape = (2, 2, 3, -1)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpInplace(OpTest):
-    def setUp(self):
-        ori_shape = (2, 25)
-        new_shape = (5, 10)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInferInplace1(OpTest):
-    def setUp(self):
-        ori_shape = (5, 10)
-        new_shape = (5, -1, 5)
+        self.init_data()
+        self.op_type = "reshape2"
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.attrs = {"shape": self.new_shape}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.infered_shape),
+            'XShape': np.random.random(self.ori_shape).astype("float32")
+        }

-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+    def init_data(self):
+        self.ori_shape = (2, 25)
+        self.new_shape = (5, 10)
+        self.infered_shape = (5, 10)

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")


-class TestReshapeOpDimInferInplace2(OpTest):
-    def setUp(self):
-        ori_shape = (2, 2, 6)
-        new_shape = (2, 0, 3, -1)
-        infered_shape = (2, 2, 3, -1)
-
-        self.op_type = "reshape"
-        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
-        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+class TestReshapeOpDimInfer1(TestReshapeOp):
+    def init_data(self):
+        self.ori_shape = (5, 10)
+        self.new_shape = (5, -1, 5)
+        self.infered_shape = (5, -1, 5)

-    def test_check_output(self):
-        self.check_output()

-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+class TestReshapeOpDimInfer2(TestReshapeOp):
+    def init_data(self):
+        self.ori_shape = (2, 2, 6)
+        self.new_shape = (2, 0, 3, -1)
+        self.infered_shape = (2, 2, 3, -1)


 class TestReshapeOpWithInputShape(OpTest):
@@ -130,20 +63,23 @@ class TestReshapeOpWithInputShape(OpTest):
        new_shape = (0, -1, 5)
        actual_shape = (2, 3, 5)

-        self.op_type = "reshape"
+        self.op_type = "reshape2"
        self.inputs = {
            "X": np.random.random(ori_shape).astype("float32"),
            "Shape": np.array(
                actual_shape, dtype="int32")
        }
        self.attrs = {"shape": new_shape}
-        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(actual_shape),
+            'XShape': np.random.random(ori_shape).astype("float32")
+        }

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
+        self.check_grad(["X"], "Out", sum_outputs=["Out"])


 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
@@ -15,90 +15,164 @@
 from __future__ import print_function

 import unittest
+
 import numpy as np
-from op_test import OpTest
-
-
-class TestRmspropOp1(OpTest):
-    ''' Test RMSProp with explicit inputs
-    '''
-
-    def setUp(self):
-        self.op_type = "rmsprop"
-
-        param = np.random.random((123, 321)).astype("float32")
-        mean_square = np.random.random((123, 321)).astype("float32")
-        learning_rate = np.array([0.01]).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-
-        epsilon = 1e-6
-        decay = 0.9
-        momentum = 0.0
-
-        self.inputs = {
-            'Param': param,
-            'MeanSquare': mean_square,
-            'LearningRate': learning_rate,
-            'Grad': grad,
-            'Moment': moment,
-        }
-
-        self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum}
-
-        ms_out = decay * mean_square + (1 - decay) * grad * grad
-        moment_out = momentum * moment + \
-            learning_rate * grad / np.sqrt(ms_out + epsilon)
-        param_out = param - moment_out
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'MeanSquareOut': ms_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestRmspropOp2(OpTest):
-    '''Test RMSProp with default values for attributes
-    '''
-
-    def setUp(self):
-        self.op_type = "rmsprop"
-
-        param = np.random.random((123, 321)).astype("float32")
-        mean_square = np.random.random((123, 321)).astype("float32")
-        learning_rate = np.array([0.01]).astype("float32")
-        grad = np.random.random((123, 321)).astype("float32")
-        moment = np.zeros((123, 321)).astype("float32")
-
-        epsilon = 1.0e-10
-        decay = 0.9
-        momentum = 0.0
-
-        self.inputs = {
-            'Param': param,
-            'MeanSquare': mean_square,
-            'LearningRate': learning_rate,
-            'Grad': grad,
-            'Moment': moment,
-        }
-
-        ms_out = decay * mean_square + (1 - decay) * grad * grad
-        moment_out = momentum * moment + \
-            learning_rate * grad / np.sqrt(ms_out + epsilon)
-        param_out = param - moment_out
-
-        self.outputs = {
-            'ParamOut': param_out,
-            'MomentOut': moment_out,
-            'MeanSquareOut': ms_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+class TestBase(unittest.TestCase):
+    def setup(self, centered, epsilon=1e-6):
+        np.random.seed(5)  # fix seed
+
+        self.param_name = "param"
+        self.param = np.random.random((123, 321)).astype("float32")
+
+        self.mean_square_name = "mean_square"
+        self.mean_square = np.random.random((123, 321)).astype("float32")
+
+        self.mean_grad_name = "mean_grad"
+        self.mean_grad = np.random.random((123, 321)).astype("float32")
+
+        self.lr_name = "lr"
+        self.learning_rate = np.array([0.01]).astype("float32")
+
+        self.grad_name = "grad"
+        self.grad = np.random.random((123, 321)).astype("float32")
+
+        self.moment_name = "moment"
+        self.moment = np.zeros((123, 321)).astype("float32")
+
+        self.epsilon = epsilon
+        self.decay = 0.9
+        self.momentum = 0.0
+        self.centered = centered
+
+        self.ms_out = self.decay * self.mean_square + (1 - self.decay
+                                                       ) * self.grad * self.grad
+        if centered:
+            self.mg_out = self.decay * self.mean_grad + (1 - self.decay
+                                                         ) * self.grad
+            self.moment_out = self.momentum * self.moment + \
+                              self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon)
+        else:
+            self.moment_out = self.momentum * self.moment + \
+                              self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon)
+
+        self.param_out = self.param - self.moment_out
+
+    def check(self,
+              actual_t,
+              expect_t,
+              place,
+              out_name,
+              atol=1e-5,
+              equal_nan=False):
+        self.assertTrue(
+            np.allclose(
+                actual_t, expect_t, atol=atol, equal_nan=equal_nan),
+            "Output (" + out_name + ") has diff at " + str(place) + "\nExpect "
+            + str(expect_t) + "\n" + "But Got" + str(actual_t))
+
+
+class TestRmspropOp(TestBase):
+    def check_with_place(self, place, centered, epsilon):
+        self.setup(centered, epsilon)
+        scope = core.Scope()
+
+        # create and initialize Param Variable
+        param = scope.var(self.param_name).get_tensor()
+        param.set(self.param, place)
+
+        mean_square = scope.var(self.mean_square_name).get_tensor()
+        mean_square.set(self.mean_square, place)
+
+        lr = scope.var(self.lr_name).get_tensor()
+        lr.set(self.learning_rate, place)
+
+        grad = scope.var(self.grad_name).get_tensor()
+        grad.set(self.grad, place)
+
+        moment = scope.var(self.moment_name).get_tensor()
+        moment.set(self.moment, place)
+
+        # create and run rmsprop operator
+
+        if self.centered:
+            mean_grad = scope.var(self.mean_grad_name).get_tensor()
+            mean_grad.set(self.mean_grad, place)
+
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                MeanGrad=self.mean_grad_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
+                ParamOut=self.param_name,
+                MeanSquareOut=self.mean_square_name,
+                MomentOut=self.moment_name,
+                MeanGradOut=self.mean_grad_name,
+                epsilon=self.epsilon,
+                decay=self.decay,
+                momentum=self.momentum,
+                centered=True)
+        else:
+            rmsprop_op = Operator(
+                "rmsprop",
+                Param=self.param_name,
+                Grad=self.grad_name,
+                MeanSquare=self.mean_square_name,
+                Moment=self.moment_name,
+                LearningRate=self.lr_name,
+                ParamOut=self.param_name,
+                MeanSquareOut=self.mean_square_name,
+                MomentOut=self.moment_name,
+                epsilon=self.epsilon,
+                decay=self.decay,
+                momentum=self.momentum,
+                centered=False)
+
+        rmsprop_op.run(scope, place)
+
+        atol = 1e-5
+        equal_nan = False
+
+        if self.centered:
+            atol = 1e-3
+            equal_nan = True
+
+        self.check(
+            np.array(mean_square), self.ms_out, place, self.mean_square_name)
+        self.check(
+            np.array(moment),
+            self.moment_out,
+            place,
+            self.moment_name,
+            atol=atol,
+            equal_nan=equal_nan)
+        self.check(
+            np.array(param),
+            self.param_out,
+            place,
+            self.param_name,
+            atol=atol,
+            equal_nan=equal_nan)
+
+        if self.centered:
+            self.check(
+                np.array(mean_grad), self.mg_out, place, self.mean_grad_name)
+
+    def test_rmsprop(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place, False, 1e-6)
+            self.check_with_place(place, False, 1e-10)
+            self.check_with_place(place, True, 1e-6)
+            self.check_with_place(place, True, 1e-10)


 if __name__ == "__main__":

--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -61,7 +61,7 @@ class TestROIPoolOp(OpTest):

        for i in range(self.rois_num):
            roi = self.rois[i]
-            roi_batch_id = roi[0]
+            roi_batch_id = int(roi[0])
            roi_start_w = int(cpt.round(roi[1] * self.spatial_scale))
            roi_start_h = int(cpt.round(roi[2] * self.spatial_scale))
            roi_end_w = int(cpt.round(roi[3] * self.spatial_scale))
@@ -125,7 +125,7 @@ class TestROIPoolOp(OpTest):
                roi = [bno, x1, y1, x2, y2]
                rois.append(roi)
        self.rois_num = len(rois)
-        self.rois = np.array(rois).astype("int64")
+        self.rois = np.array(rois).astype("float32")

    def setUp(self):
        self.op_type = "roi_pool"

--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -18,12 +18,17 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
+from test_anchor_generator_op import anchor_generator_in_python
+from test_generate_proposal_labels import _generate_groundtruth
+from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta


-def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,
-                      rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(iou)
+def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
+                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+    iou = np.transpose(gt_anchor_iou)
    anchor_to_gt_max = iou.max(axis=1)
+    anchor_to_gt_argmax = iou.argmax(axis=1)
+
    gt_to_anchor_argmax = iou.argmax(axis=0)
    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
@@ -42,59 +47,113 @@ def rpn_target_assign(iou, rpn_batch_size_per_im, rpn_positive_overlap,

    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
+    tgt_lbl[bg_inds] = 0
    if len(bg_inds) > num_bg:
        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
        tgt_lbl[enable_inds] = 0
    bg_inds = np.where(tgt_lbl == 0)[0]
+    tgt_lbl[bg_inds] = 0

    loc_index = fg_inds
    score_index = np.hstack((fg_inds, bg_inds))
    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
-    return loc_index, score_index, tgt_lbl
+
+    gt_inds = anchor_to_gt_argmax[fg_inds]
+
+    return loc_index, score_index, tgt_lbl, gt_inds
+
+
+def get_anchor(n, c, h, w):
+    input_feat = np.random.random((n, c, h, w)).astype('float32')
+    anchors, _ = anchor_generator_in_python(
+        input_feat=input_feat,
+        anchor_sizes=[32., 64.],
+        aspect_ratios=[0.5, 1.0],
+        variances=[1.0, 1.0, 1.0, 1.0],
+        stride=[16.0, 16.0],
+        offset=0.5)
+    return anchors
+
+
+def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
+             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
+
+    loc_indexes = []
+    score_indexes = []
+    tmp_tgt_labels = []
+    tgt_bboxes = []
+    anchor_num = anchor.shape[0]
+
+    batch_size = len(lod) - 1
+    for i in range(batch_size):
+        b, e = lod[i], lod[i + 1]
+        iou_slice = iou[b:e, :]
+        bboxes_slice = gt_boxes[b:e, :]
+
+        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
+            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, fg_fraction)
+
+        fg_bboxes = bboxes_slice[gt_inds]
+        fg_anchors = anchor[loc_idx]
+        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+
+        if i == 0:
+            loc_indexes = loc_idx
+            score_indexes = score_idx
+            tmp_tgt_labels = tgt_lbl
+            tgt_bboxes = box_deltas
+        else:
+            loc_indexes = np.concatenate(
+                [loc_indexes, loc_idx + i * anchor_num])
+            score_indexes = np.concatenate(
+                [score_indexes, score_idx + i * anchor_num])
+            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])
+
+    tgt_labels = tmp_tgt_labels[score_indexes]
+    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels


 class TestRpnTargetAssignOp(OpTest):
    def setUp(self):
-        iou = np.random.random((10, 8)).astype("float32")
-        self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
-        self.attrs = {
-            'rpn_batch_size_per_im': 256,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.3,
-            'fg_fraction': 0.25,
-            'fix_seed': True
-        }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 256, 0.95, 0.3,
-                                                            0.25)
-        self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
-        }
+        n, c, h, w = 2, 4, 14, 14
+        anchor = get_anchor(n, c, h, w)
+        gt_num = 10
+        anchor = anchor.reshape(-1, 4)
+        anchor_num = anchor.shape[0]

-    def test_check_output(self):
-        self.check_output()
+        im_shapes = [[64, 64], [64, 64]]
+        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
+        bbox = np.vstack([v['boxes'] for v in gt_box])

+        iou = _bbox_overlaps(bbox, anchor)
+
+        anchor = anchor.astype('float32')
+        bbox = bbox.astype('float32')
+        iou = iou.astype('float32')
+
+        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
+            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)

-class TestRpnTargetAssignOp2(OpTest):
-    def setUp(self):
-        iou = np.random.random((10, 20)).astype("float32")
        self.op_type = "rpn_target_assign"
-        self.inputs = {'DistMat': iou}
+        self.inputs = {
+            'Anchor': anchor,
+            'GtBox': (bbox, [[4, 4]]),
+            'DistMat': (iou, [[4, 4]]),
+        }
        self.attrs = {
-            'rpn_batch_size_per_im': 128,
-            'rpn_positive_overlap': 0.5,
-            'rpn_negative_overlap': 0.5,
-            'fg_fraction': 0.5,
+            'rpn_batch_size_per_im': 25600,
+            'rpn_positive_overlap': 0.95,
+            'rpn_negative_overlap': 0.03,
+            'fg_fraction': 0.25,
            'fix_seed': True
        }
-        loc_index, score_index, tgt_lbl = rpn_target_assign(iou, 128, 0.5, 0.5,
-                                                            0.5)
        self.outputs = {
-            'LocationIndex': loc_index,
-            'ScoreIndex': score_index,
-            'TargetLabel': tgt_lbl,
+            'LocationIndex': loc_index.astype('int32'),
+            'ScoreIndex': score_index.astype('int32'),
+            'TargetBBox': tgt_bbox.astype('float32'),
+            'TargetLabel': tgt_lbl.astype('int64'),
        }

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -25,9 +25,9 @@ class TestSamplingIdOp(OpTest):
        self.op_type = "sampling_id"
        self.use_mkldnn = False
        self.init_kernel_type()
-        self.X = np.random.random((8, 4)).astype('float32')
+        self.X = np.random.random((100, 10)).astype('float32')
        self.inputs = {"X": self.X}
-        self.Y = np.random.random(8).astype('float32')
+        self.Y = np.random.random(100).astype('int64')
        self.outputs = {'Out': self.Y}
        self.attrs = {'max': 1.0, 'min': 0.0, 'seed': 1}

@@ -36,6 +36,16 @@ class TestSamplingIdOp(OpTest):
        y1 = self.out
        self.check_output_customized(self.verify_output)
        y2 = self.out
+
+        # check dtype
+        assert y1.dtype == np.int64
+        assert y2.dtype == np.int64
+
+        # check that every output id is a valid column index of the input X
+        inputs_ids = np.arange(self.X.shape[1])
+        assert np.isin(y1, inputs_ids).all()
+        assert np.isin(y2, inputs_ids).all()
+
        self.assertTrue(np.array_equal(y1, y2))
        self.assertEqual(len(y1), len(self.Y))


--- a/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sequence_enumerate(input_seq, in_lod, win_size, pad_value):
+    lod0 = [0]
+    for i in range(0, len(in_lod[0])):
+        lod0.append(lod0[i] + in_lod[0][i])
+    out_seq = []
+    for i in range(0, len(lod0) - 1):
+        for idx in range(lod0[i], lod0[i + 1]):
+            single_seq = []
+            for word_idx in range(win_size):
+                word_pos = idx + word_idx
+                dat = input_seq[word_pos] if word_pos < lod0[i+1] \
+                    else pad_value
+                single_seq.append(dat)
+            out_seq.append(single_seq)
+    return out_seq
+
+
+class TestSequenceEnumerateOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_enumerate"
+        self.init_test_case()
+        self.inputs = {'X': (self.in_seq, self.lod)}
+        self.attrs = {'win_size': self.win_size, 'pad_value': self.pad_value}
+        self.outputs = {'Out': (self.out_seq, self.lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 2
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TesSequenceEnumerateOpInt64(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 2
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int64")
+
+
+class TestSequenceEnumerateOpLargeWinSize(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 5
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TestSequenceEnumerateOpMaxWinSize(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 30
+        self.pad_value = 0
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+class TestSequenceEnumerateOpLargePadValue(TestSequenceEnumerateOp):
+    def init_test_case(self):
+        self.in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        self.lod = [[9, 4, 11, 6]]
+        self.win_size = 5
+        self.pad_value = 5
+        out_seq = sequence_enumerate(self.in_seq, self.lod, self.win_size,
+                                     self.pad_value)
+        self.out_seq = np.array(out_seq).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -23,14 +23,17 @@ from op_test import OpTest
 # Correct: General.
 class TestSqueezeOp(OpTest):
    def setUp(self):
-        self.op_type = "squeeze"
+        self.op_type = "squeeze2"
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -22,16 +22,19 @@ from op_test import OpTest
 class TestTransposeOp(OpTest):
    def setUp(self):
        self.initTestCase()
-        self.op_type = "transpose"
+        self.op_type = "transpose2"
        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
        self.attrs = {'axis': list(self.axis)}
-        self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
+        self.outputs = {
+            'XShape': np.random.random(self.shape).astype("float32"),
+            'Out': self.inputs['X'].transpose(self.axis)
+        }

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', sum_outputs=['Out'])

    def initTestCase(self):
        self.shape = (3, 4)

--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -24,13 +24,16 @@ from op_test import OpTest
 class TestUnsqueezeOp(OpTest):
    def setUp(self):
        self.init_test_case()
-        self.op_type = "unsqueeze"
+        self.op_type = "unsqueeze2"
        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
        self.init_attrs()
-        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+            "XShape": np.random.random(self.ori_shape).astype("float32")
+        }

    def test_check_output(self):
-        self.check_output()
+        self.check_output(no_check_set=["XShape"])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -153,7 +153,7 @@ def block_to_code(block, block_idx):

    indent += 1
    # sort all vars
-    all_vars = sorted(block.vars.iteritems(), key=lambda x: x[0])
+    all_vars = sorted(six.iteritems(block.vars), key=lambda x: x[0])
    for var in all_vars:
        print("{}{}".format(get_indent_space(indent), variable_to_code(var[1])))


--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -300,7 +300,7 @@ class DistributeTranspiler(object):
            input_deps = grad_name_to_send_dummy_out.values()
            program.global_block().append_op(
                type="send_barrier",
-                inputs={"X": input_deps},
+                inputs={"X": list(input_deps)},
                outputs={"Out": send_barrier_out},
                attrs={
                    "endpoints": pserver_endpoints,
@@ -455,7 +455,7 @@ class DistributeTranspiler(object):
            if len(splited_var) <= 1:
                continue
            # NOTE: if enable memory optimization, origin vars maybe removed.
-            if startup_program.global_block().vars.has_key(varname):
+            if varname in startup_program.global_block().vars:
                orig_param = startup_program.global_block().vars[varname]
            else:
                origin_param_var = self.origin_program.global_block().vars[
@@ -1096,7 +1096,8 @@ class DistributeTranspiler(object):
            self.table_name]

        zero_dim = int(
-            math.ceil(origin_param_var.shape[0] / len(self.pserver_endpoints)))
+            math.ceil(origin_param_var.shape[0] / float(
+                len(self.pserver_endpoints))))
        table_shape = list(origin_param_var.shape)
        table_shape[0] = zero_dim

@@ -1390,8 +1391,6 @@ class DistributeTranspiler(object):
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
-            # TODO(panyx0718): What if it's SELECTED_ROWS.
-            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},