diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..a631ad14b18310598f7eea3a51839d61a9e456ff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
deleted file mode 100644
index 3df10d801e568834729f902aace483d033340e2d..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# FileManager设计文档
-## 目标
-在本文档中，我们设计说明了名为FileManager系统，方便用户上传自己的训练数据以进行分布式训练
-
-主要功能包括：
-
-- 提供常用的命令行管理命令管理文件和目录
-- 支持大文件的断点上传、下载  
-
-## 名词解释
-- PFS：是`Paddlepaddle cloud File System`的缩写，是对用户文件存储空间的抽象，与之相对的是local filesystem。目前我们用CephFS来搭建。
-- [CephFS](http://docs.ceph.com/docs/master/cephfs/)：一个POSIX兼容的文件系统。
-- Chunk：逻辑划上文件分块的单位。
-
-## 模块
-### 架构图
-<image src=./src/filemanager.png width=900>
-
-### PFSClient
-- 功能： 详细设计[link](./pfs/pfsclient.md)
-	- 提供用户管理文件的命令
-	- 需要可以跨平台执行
-
-- 双向验证   
-	PFSClient需要和Ingress之间做双向验证<sup>[tls](#tls)</sup>，所以用户需要首先在`cloud.paddlepaddle.org`上注册一下，申请用户空间，并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地，然后才能使用PFSClient。
-		
-### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
-- 功能：  
-	提供七层协议的反向代理、基于粘性会话的负载均衡功能。
-	
-- 透传用户身份的办法  
-	Ingress需要把PFSClient的身份信息传给PFSServer，配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
-
-### PFSServer
-PFSServer提供RESTful API接口，接收处理PFSClient端的文件管理请求，并且把结果返回PFSClient端。
-
-RESTful API
-
-- /api/v1/files
-	- `GET /api/v1/files`: Get metadata of files or directories.
-	- `POST /api/v1/files`: Create files or directories.
-	- `PATCH /api/v1/files`: Update files or directories.
-	- `DELETE /api/v1/files`: Delete files or directories.
-
-- /api/v1/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file.
-
-- /api/v1/storage/files
-	- `GET /api/v1/storage/files`: Download files or directories.
-	- `POST /api/v1/storage/files`: Upload files or directories.
-
-- /api/v1/storage/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Download chunks's data.
-	- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
-
-## 文件传输优化
-
-### 分块文件传输
-用户文件可能是比较大的，上传到Cloud或者下载到本地的时间可能比较长，而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题，我们提出了Chunk的概念，一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小（默认256K），完成一个传输动作完成的时间也比较短，不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
-
-一个典型的Chunk如下所示：
-
-```
-type Chunk struct {
-	fileOffset int64
-	checksum uint32
-	len     uint32
-	data    []byte
-}
-```  
-
-### 生成sparse文件
-当destination文件不存在或者大小和source文件不一致时，可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件，然后就可以并发写入多个Chunk。
-
-### 覆盖不一致的部分
-文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致，不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
-
-## 用户使用流程
-参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
-
-## 框架生成
-用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分，以便我们可以把更多的精力放到逻辑本身上。
-
-## 参考文档
-- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
-- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
-- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
deleted file mode 100644
index 56bc70c54bbc92b78d66e04fb495b1300cf8ebe0..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/pfs/pfsclient.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# PFSClient
-
-## Description
-The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
-
-## Synopsis
-```
-paddle [options] pfs <subcommand> [parameters]
-```
-
-## Options
-```
---profile (string)
-	Use a specific profile from your credential file.
-
---help (string)
-	Display more information about command
-
---version
-	Output version information and exit
-
---debug
-	Show detailed debugging log	
-	
---only-show-errors (boolean) 
-	Only errors and warnings are displayed. All other output is suppressed.
-```
-
-## Path Arguments
-When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`.  
-
-A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`.
-
-[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
-
-## order of Path Arguments
-Commonly, if there are two path arguments, the first is the source, and the second is the destination.
-
-## Subcommonds
-- rm - remove files or directories
-
-```
-Synopsis:
-	rm [-r] [-v] <PFSPath> ...
-
-Options:
-	-r 
-		Remove directories and their contents recursively 
-	-v      
-		Cause rm to be verbose, showing files after they are removed.
-	
-Examples:
-	paddle pfs rm /pfs/$DATACENTER/home/$USER/file
-	paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
-```
-- mv - move (rename) files
-
-```
-Synopsis:
-	mv [-f | -n] [-v] <LocalPath> <PFSPath>
-	mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
-	mv [-f | -n] [-v] <PFSPath> <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> <PFSPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <PFSPath> 
-	
-Options:
-	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause mv to be verbose, showing files after they are moved.
-		
-Examples:
-	paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
-```
-- cp - copy files or directories
-
-```
-Synopsis:
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
-
-Options:
-	-r
-   		Copy directories recursively
-   	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause cp to be verbose, showing files after they are copied.
-	--preserve--links
-	   Reserve links when copy links
-	   
-Examples:
-	paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
-	paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
-```
-- ls- list files
-
-```
-Synopsis:
-	ls [-r] <PFSPath> ...
-	
-Options:
-	-R
-   		List directory(ies) recursively
-
-Examples:
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/file
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/folder
-```
-
-- mkdir - mkdir directory(ies)
-Create intermediate directory(ies) as required.
-
-```
-Synopsis:
-	mkdir <PFSPath> ...
-
-Examples:
-	paddle pfs mkdir  /pfs/$DATACENTER/home/$USER/folder
-```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
deleted file mode 100644
index 7861a33072bc1908f69d12b37c20491dd8663103..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.graffle and /dev/null differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
deleted file mode 100644
index 8139a19f5722f56d3c211f3ab0d3982f751134b9..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.png and /dev/null differ
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index f627437f354a12c79cad25c959409db29ecbd874..b123b756e2251c38f319e1aefa2cb04fd7a36b03 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -9,5 +9,5 @@
   use_eigen_cn.md
   name_convention.md
   support_new_device.md
-  releasing_process.md
+  releasing_process_cn.md
   op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 0b65fed67ad45eb399b624184485a99a082d79e9..98988fc22dcedecdbcd67fb3bf761377bf046337 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -9,5 +9,5 @@ Development
   use_eigen_en.md
   name_convention.md
   support_new_device.md
-  releasing_process.md
+  releasing_process_en.md
   op_markdown_format.md
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process_cn.md
similarity index 74%
rename from doc/fluid/dev/releasing_process.md
rename to doc/fluid/dev/releasing_process_cn.md
index c5943ccd81c2ae2aaacd2676da12509db889f54a..4c6728fba7150b0f1e180e57590f18a5b677c70d 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本，遵循以下流程:
   * 使用Regression Test List作为检查列表，测试本次release的正确性。
 	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
 	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-	* 编译这个版本的python wheel包，并发布到pypi。
-		* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
-		* pypi上的package名称为paddlepaddle和paddlepaddle_gpu，如果要上传GPU版本的包，需要修改build/python/setup.py中，name: "paddlepaddle_gpu"并重新打包wheel包：`python setup.py bdist_wheel`。
-		* 上传方法：
-			```
-			cd build/python
-			pip install twine
-			twine upload dist/[package to upload]
-			```
-		* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
-1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 协同完成Release Note的书写
-
+	* 将这个版本的python wheel包发布到pypi。
+	* 更新Docker镜像（参考后面的操作细节）。
+1. 第三步完成后，将`release/版本号`分支合入master分支，将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。
+1. 协同完成Release Note的书写。
 
 需要注意的是:
 
@@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 
 ## 发布wheel包到pypi
 
-使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
 完成自动化二进制编译，参考下图，选择需要发布的版本（通常包含一个CPU版本和一个GPU版本），点击"run"右侧的"..."按钮，可以
-弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。等待编译完成后
-可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。然后按照上述的方法
-使用`twine`工具上传即可。
-
-<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传：
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
 
 * 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
   发型版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub，所以，发布Docker镜像只需要对自动push的镜像打上
 版本号对应的tag即可：
 
-1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。
-1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`，latest tag可以是latest或latest-gpu等。
-1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. 执行 `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括：
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本（CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
 
 ## PaddlePaddle 分支规范
 
@@ -76,7 +82,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 
 ### PaddlePaddle Book中所有章节
 
-PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练（V2和Fluid）模型正确性。
 
 <table>
 <thead>
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,210 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using the "git-flow branching model", and uses [Semantic Versioning](http://semver.org/) for its version numbers.
+
+Each time we release a new PaddlePaddle version, we should follow the below steps:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, and the second should be `0.10.0rc2` and so on.
+1. After that, we should do:
+   * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
+     that this release has no major bugs.
+     * If a regression test fails, we must fix those bugs and create a new `release/[version]`
+       branch from the previous release branch.
+   * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+   * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+   * Update the Docker images (see below instructions for detail).
+1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
+   then merge `master` to `develop`.
+1. Update the Release Note.          
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches, so that each release branch
+  contains only the features of the current release and we can test on that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all wheel packages needed to publish. As shown in the following picture, choose a build
+   version, click the "..." button on the right side of the "Run" button, switch to the second tab in the
+   pop-up box, choose the current release branch and click the "Run Build" button. You may repeat this
+   step to start different versions of builds.
+   <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions. You can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+  the scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+  old version. You must change the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version (using CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then check the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow the
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model; every fork is treated like a feature branch.
+  * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
+  * Advice: fork new branches from the fork's develop branch to start developing.
+  * Use that branch on the developer's fork to create pull requests and start reviews.
+    * Developers can push new commits to that branch while the pull request is open.
+* Bug fixes also start from developers' forked repos. A bug-fix branch can be merged into
+  `master`, `develop` and `release/[version]` branches.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book can run correctly, including
+V1 (`paddle_trainer` training), V2 training and Fluid training.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Linear Regression</th>
+<th>Recognize Digits</th>
+<th>Image Classification</th>
+<th>Word2Vec</th>
+<th>Personalized Recommendation</th>
+<th>Sentiment Analysis</th>
+<th>Semantic Role Labeling</th>
+<th>Machine Translation</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index bf1a705ef50b663efa53393ead1f81fd6bcf8c48..89b5c6847f15b3f2a270fe1e7db9e590549e8982 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,6 +16,6 @@ else()
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
             scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 17e38b1cf042657834b4d0d1c12cbbb92f19fa45..194df3e4a8b50700e2be01ce5ebca83b92501fb8 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 
 #include <memory>  // for unique_ptr
-#include <mutex>   // for call_once
 #include <set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -39,6 +38,7 @@ Scope::~Scope() {
 }
 
 Scope& Scope::NewScope() const {
+  std::unique_lock<std::mutex> lock(mutex_);
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) {
+  std::unique_lock<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
   }
 }
 
-void Scope::EraseVars(std::vector<std::string>& var_names) {
+void Scope::EraseVars(const std::vector<std::string>& var_names) {
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index c1e1f49caaa5a60df0e97289aada465b45213971..97a15c71773051dfc01c98f11cf9cb76adbcec7f 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <list>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -51,7 +52,7 @@ class Scope {
   /// Create a variable with a scope-unique name.
   Variable* Var(std::string* name = nullptr);
 
-  void EraseVars(std::vector<std::string>& var_names);
+  void EraseVars(const std::vector<std::string>& var_names);
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
@@ -88,6 +89,9 @@ class Scope {
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);
+
+ private:
+  mutable std::mutex mutex_;
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
index 3adeeda90645ca983d9d9229b4cc1c4c90302206..719a7465b8d58ef8588ff1e83c2b971eb6fbb00f 100644
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE)
   set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
       cares zlib protobuf sendrecvop_grpc)
-  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
 endif()
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index ef987d07f08525bff5267cdc2076ae767417e4f1..8bbfd1f15925992efdeaaffbbe7b350ffbcee889 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -138,7 +138,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
     auto* var = p_scope->FindVar(in_var_name_val);
 
     ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req);
+    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
 
     // var handle
     VarHandle var_h;
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 2e7bf1921a26fc88d854e4db2c501548695a136a..d5fc163bc25409e0607b149b61c6266b38119d9d 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase {
                            framework::Scope* scope,
                            const platform::DeviceContext* dev_ctx,
                            framework::Executor* executor,
-                           framework::ProgramDesc* program, int blkid)
+                           framework::ProgramDesc* program,
+                           framework::ExecutorPrepareContext* prefetch_ctx)
       : RequestBase(service, cq, dev_ctx),
         responder_(&ctx_),
         scope_(scope),
         executor_(executor),
         program_(program),
-        blkid_(blkid) {
+        prefetch_ctx_(prefetch_ctx) {
+    request_.reset(new VariableResponse(scope, dev_ctx_));
     int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
-                                cq_, this);
+    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
+                                cq_, cq_, this);
   }
 
   virtual ~RequestPrefetch() {}
 
-  virtual std::string GetReqName() { return request_.varname(); }
+  virtual std::string GetReqName() { return request_->Varname(); }
 
   virtual void Process() {
     // prefetch process...
     ::grpc::ByteBuffer reply;
-    // TODO(Yancey1989): execute the Block which containers prefetch ops
 
-    VLOG(3) << "RequestPrefetch Process in";
+    std::string var_name = request_->OutVarname();
+    auto var_desc = program_->Block(0).FindVar(var_name);
+    framework::Scope* local_scope = &scope_->NewScope();
+    auto* var = local_scope->FindVar(var_name);
+    InitializeVariable(var, var_desc->GetType());
+    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
+
+    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);
 
     responder_.Finish(reply, ::grpc::Status::OK, this);
     status_ = FINISH;
   }
 
  protected:
-  sendrecv::VariableMessage request_;
+  std::shared_ptr<VariableResponse> request_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
   framework::Scope* scope_;
   framework::Executor* executor_;
   framework::ProgramDesc* program_;
+  framework::ExecutorPrepareContext* prefetch_ctx_;
   int blkid_;
 };
 
@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
   }
   RequestPrefetch* prefetch =
       new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
-                          executor_, program_, prefetch_blk_id_);
+                          executor_, program_, prefetch_ctx_);
 
   VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
 }
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
index 380447f47c142bdc16e60f78c4b2d94235ec5060..b6110f92ed4f38a156e0c99ecfb399f3f47a169e 100644
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -63,6 +63,10 @@ class AsyncGRPCServer final {
 
   void SetExecutor(framework::Executor *executor) { executor_ = executor; }
 
+  void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
+    prefetch_ctx_ = prepared;
+  }
+
   int GetSelectedPort() { return selected_port_; }
 
   const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
@@ -111,6 +115,7 @@ class AsyncGRPCServer final {
   std::unique_ptr<std::thread> t_prefetch_;
 
   int prefetch_blk_id_;
+  framework::ExecutorPrepareContext *prefetch_ctx_;
   framework::ProgramDesc *program_;
   framework::Executor *executor_;
   int selected_port_;
diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc
index b89aed0157de8e95564015b3e7f42316a39537f5..c51933718f4ca78e87c77e007c485642000d247d 100644
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -20,43 +20,121 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"
 
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace detail = paddle::operators::detail;
 
+USE_OP(lookup_table);
+
 std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;
 
+// Appends to |program| a sub-block holding a single lookup_table op
+// (W="w", Ids="ids" -> Out="out") and declares "out" in the root block.
+framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
+  auto root_block = program->MutableBlock(0);
+  auto* block = program->AppendBlock(*root_block);
+
+  auto op = block->AppendOp();
+  op->SetType("lookup_table");
+  op->SetInput("W", {"w"});
+  op->SetInput("Ids", {"ids"});
+  op->SetOutput("Out", {"out"});
+
+  auto& out = *root_block->Var("out");
+  out.SetType(framework::proto::VarType::SELECTED_ROWS);
+  out.SetShape({10, 10});
+
+  return block;
+}
+
+void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
+  auto w_var = scope->Var("w");
+  w_var->GetMutable<framework::SelectedRows>();
+
+  auto out_var = scope->Var("out");
+  out_var->GetMutable<framework::SelectedRows>();
+
+  auto ids_var = scope->Var("ids");
+  ids_var->GetMutable<framework::SelectedRows>();
+}
+
+void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
+  auto rows = ids_var->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
+  ids_var->mutable_value()->Resize({rows_numel, 1});
+  ids_var->mutable_value()->mutable_data<float>(*place);
+}
+
+void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
+  auto rows = w->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
+  auto w_value = w->mutable_value();
+  w_value->Resize({rows_numel, 10});
+
+  auto ptr = w_value->mutable_data<float>(*place);
+
+  for (int64_t i = 0; i < w_value->numel(); ++i) {
+    ptr[i] = static_cast<float>(i / 10);
+  }
+}
+
 void StartServer(const std::string& endpoint) {
   rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+  auto* block = AppendPrefetchBlcok(&program);
+  auto prepared = exe.Prepare(program, block->ID());
+  InitTensorsOnServer(&scope, &place, 10);
+
+  rpc_service_->SetProgram(&program);
+  rpc_service_->SetPrefetchPreparedCtx(prepared.get());
+  rpc_service_->SetDevCtx(&ctx);
+  rpc_service_->SetScope(&scope);
+  rpc_service_->SetExecutor(&exe);
+
   rpc_service_->RunSyncUpdate();
 }
 
 TEST(PREFETCH, CPU) {
   // start up a server instance backend
-  // TODO(Yancey1989): Need to start a server with optimize blocks and
-  // prefetch blocks.
   std::thread server_thread(StartServer, "127.0.0.1:8889");
+  sleep(2);
   framework::Scope scope;
   platform::CPUPlace place;
   platform::CPUDeviceContext ctx(place);
   // create var on local scope
-  std::string in_var_name("in");
+  int64_t rows_numel = 5;
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("ids");
   std::string out_var_name("out");
-  auto* in_var = scope.Var(in_var_name);
-  auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
-  in_tensor->Resize({10, 10});
-  VLOG(3) << "before mutable_data";
-  in_tensor->mutable_data<int>(place);
 
-  scope.Var(out_var_name);
-
-  VLOG(3) << "before fetch";
   detail::RPCClient client;
   client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
                                out_var_name);
   client.Wait();
 
+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::SelectedRows>()->value();
+  auto ptr = value.mutable_data<float>(place);
+
   rpc_service_->ShutDown();
   server_thread.join();
   rpc_service_.reset(nullptr);
+
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+  }
 }
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
index fc12e82a7e6bd10262092d1ca367980df64e91c2..02bb2b9cebb87b83aa1cbef0c644f969b4d17284 100644
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,7 +21,7 @@ service SendRecvService {
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  // Prefetch variable by Ids
+  // Prefetch a variable by the given variable name and Ids.
   rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 }
 
@@ -67,6 +67,8 @@ message VariableMessage {
   bytes serialized = 8;
   // selected_rows data
   bytes rows = 9;
+  // Name of the output variable of the lookup-table block execution.
+  string out_varname = 10;
 }
 
 message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index f8576d01b10f4c0fda4d12d371b2966739acfc21..1577111a9628350b0cf3f01f2cf15f8c27994673 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -30,7 +30,8 @@ namespace detail {
 
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg) {
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
   using VarMsg = sendrecv::VariableMessage;
   sendrecv::VariableMessage request;
   std::string header;
@@ -52,6 +53,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
     e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
   }
 
+  if (!out_name.empty()) {
+    e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
+  }
   switch (framework::ToVarType(var->Type())) {
     case framework::proto::VarType_Type_LOD_TENSOR: {
       auto tensor = var->Get<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h
index d7954440846b8db9a9add0110fb9a546a762774d..c72e1bd076f670458f3915072154847db6205092 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*);
 
 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                            const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg);
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = std::string());
 
 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                                const platform::DeviceContext& ctx,
diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc
index 78e1d274a92241b5f2093beb63acdc8c497dfb83..c9d7fd6d1581f6f4182e9e3e0d633c13a3c336a5 100644
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) {
         }
         break;
       }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;  // byte length of the string payload that follows
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;  // wrong wire type or unreadable length prefix
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;  // fewer than `length` bytes could be read
+        }
+
+        meta_.set_out_varname(temp);  // later returned by OutVarname()
+        break;
+      }
 
       default: {
         // Unknown tag, return unknown error.
diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h
index 050b6b84010b4f3e95bc88e5bb738ff18b7fe423..93b0d3cfb4f7d7f336414361773f872d7b259482 100644
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -55,6 +55,7 @@ class VariableResponse {
   int Parse(const ::grpc::ByteBuffer& byte_buffer);
 
   inline std::string Varname() { return meta_.varname(); }
+  inline std::string OutVarname() { return meta_.out_varname(); }
 
   // should call parse first.
   framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 277f2856c07b3fec2113486539aec1d9139fae92..04c5872bef4600e30ba572a025cc5f0a5e9839ca 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {  // "Out" already holds a reader; keep it
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
     out->Reset(
         new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
   }
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index 96c0c1cbe6d588364416925a7ab1bc8f90ac6fd7..ed868786ab2a80efa42574ed4f579c633ce0becf 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
 
     auto place_str = Attr<std::string>("place");
     platform::Place place;
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index 47d9989bc8748840ec2d39587fde24355d90b6b4..b72ccc77a3e1ec30fd817471d3ffd667974ae684 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {  // "Out" already holds a reader; keep it
+      return;
+    }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    auto& out = detail::Ref(scope.FindVar(Output("Out")));
     int pass_num = Attr<int>("pass_num");
-    out.GetMutable<framework::ReaderHolder>()->Reset(
-        new MultiPassReader(underlying_reader.Get(), pass_num));
+    out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
   }
 };
 
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 3a1f3805a0483c2f5eabdc7432556051d8308964..b164ce232d6bea7b4ff0c67ee0a7dd83b14f61a2 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {  // "Out" already holds a reader; keep it
+      return;
+    }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    auto& var = detail::Ref(scope.FindVar(Output("Out")));
-    var.GetMutable<framework::ReaderHolder>()->Reset(
+    out->Reset(
         new ShuffleReader(underlying_reader.Get(),
                           static_cast<size_t>(Attr<int>("buffer_size"))));
   }
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 3b8d192b6c6356d3ab21868d3d74957700b4cdc1..a41018d350e89881888d5e31089c2b9ecd76f6c0 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
@@ -35,18 +35,18 @@ extern void *cublas_dso_handle;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    inline cublasStatus_t operator()(Args... args) {                \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                \
-      std::call_once(cublas_dso_flag,                               \
-                     paddle::platform::dynload::GetCublasDsoHandle, \
-                     &cublas_dso_handle);                           \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
-    }                                                               \
-  };                                                                \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
+  struct DynLoad__##__name {                                                 \
+    template <typename... Args>                                              \
+    inline cublasStatus_t operator()(Args... args) {                         \
+      typedef cublasStatus_t (*cublasFunc)(Args...);                         \
+      std::call_once(cublas_dso_flag, []() {                                 \
+        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
+      });                                                                    \
+      void *p_##__name = dlsym(cublas_dso_handle, #__name);                  \
+      return reinterpret_cast<cublasFunc>(p_##__name)(args...);              \
+    }                                                                        \
+  };                                                                         \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index c65b060ab46cfcd38292be66dd5f2123f88bae63..f3cd3b2bbedef7c9140c2acddea0732972ff7fa0 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 
 #ifdef PADDLE_USE_DSO
 bool HasCUDNN() {
-  std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle);
+  std::call_once(cudnn_dso_flag,  // load the cuDNN DSO on first use only
+                 []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
   return cudnn_dso_handle != nullptr;
 }
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 49a54d8478e9a4e507d31a67b924802def356bfa..24475b62ca2825c45ff7edb39328dece3b822b25 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -30,19 +30,19 @@ extern bool HasCUDNN();
 #ifdef PADDLE_USE_DSO
 
 extern void EnforceCUDNNLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    auto operator()(Args... args) -> decltype(__name(args...)) {   \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);   \
-      std::call_once(cudnn_dso_flag,                               \
-                     paddle::platform::dynload::GetCUDNNDsoHandle, \
-                     &cudnn_dso_handle);                           \
-      EnforceCUDNNLoaded(#__name);                                 \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
-    }                                                              \
-  };                                                               \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+      using cudnn_func = decltype(__name(args...)) (*)(Args...);           \
+      std::call_once(cudnn_dso_flag, []() {                                \
+        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
+      });                                                                  \
+      EnforceCUDNNLoaded(#__name);                                         \
+      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                 \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
+    }                                                                      \
+  };                                                                       \
   extern struct DynLoad__##__name __name
 
 #else
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index c1bf88f8cb690861b97686d99d36410143445243..d0d676b9d8ac462900b48246bec43166d04ef97b 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #ifdef PADDLE_WITH_CUPTI
+
 #include <cuda.h>
 #include <cupti.h>
 #include <dlfcn.h>
-#include <mutex>
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -36,18 +37,18 @@ extern void *cupti_dso_handle;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                    \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    inline CUptiResult CUPTIAPI operator()(Args... args) {         \
-      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);          \
-      std::call_once(cupti_dso_flag,                               \
-                     paddle::platform::dynload::GetCUPTIDsoHandle, \
-                     &cupti_dso_handle);                           \
-      void *p_##__name = dlsym(cupti_dso_handle, #__name);         \
-      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);     \
-    }                                                              \
-  };                                                               \
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
+      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);                  \
+      std::call_once(cupti_dso_flag, []() {                                \
+        cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
+      });                                                                  \
+      void *p_##__name = dlsym(cupti_dso_handle, #__name);                 \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
+    }                                                                      \
+  };                                                                       \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 1b3ff962d6edceb37deb94cc7daead7346d25352..4697fb6cd96770127206bdabeea77e43eb09d1f5 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <curand.h>
 #include <dlfcn.h>
-#include <mutex>
+
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -25,18 +26,18 @@ namespace dynload {
 extern std::once_flag curand_dso_flag;
 extern void *curand_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    curandStatus_t operator()(Args... args) {                       \
-      typedef curandStatus_t (*curandFunc)(Args...);                \
-      std::call_once(curand_dso_flag,                               \
-                     paddle::platform::dynload::GetCurandDsoHandle, \
-                     &curand_dso_handle);                           \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);         \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
-    }                                                               \
-  };                                                                \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                             \
+  struct DynLoad__##__name {                                                 \
+    template <typename... Args>                                              \
+    curandStatus_t operator()(Args... args) {                                \
+      typedef curandStatus_t (*curandFunc)(Args...);                         \
+      std::call_once(curand_dso_flag, []() {                                 \
+        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
+      });                                                                    \
+      void *p_##__name = dlsym(curand_dso_handle, #__name);                  \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
+    }                                                                        \
+  };                                                                         \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index e590e81bab51fd9fe12309335522614263d8e21d..3c1ccc7445ed27c711ab250aa223c66ae0da45dc 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
 #include <dlfcn.h>
+
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <string>
+
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1,
   return ret;
 }
 
-static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
-                                               void** dso_handle,
-                                               int dynload_flags) {
+static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
+                                                int dynload_flags) {
   VLOG(3) << "Try to find library: " << dso_path
           << " from default system path.";
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
 
 // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
 // bring System Integrity Projection (SIP), if dso_handle
 // is null, search from default package path in Mac OS.
 #if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == *dso_handle) {
-    dso_path = join("/usr/local/cuda/lib/", dso_path);
-    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    if (nullptr == *dso_handle) {
+  if (nullptr == dso_handle) {
+    dso_handle =
+        dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
+    if (nullptr == dso_handle) {
       if (dso_path == "libcudnn.dylib") {
         LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
                         "For instance, sudo tar -xzf "
@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
     }
   }
 #endif
+
+  return dso_handle;
 }
 
-static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
-                                              const std::string& dso_name,
-                                              void** dso_handle,
-                                              bool throw_on_error = true) {
+static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
+                                               const std::string& dso_name,
+                                               bool throw_on_error = true) {
   int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-  *dso_handle = nullptr;
+  void* dso_handle = nullptr;
 
   std::string dlPath = dso_name;
   if (search_root.empty()) {
-    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
   } else {
     // search xxx.so from custom path
     dlPath = join(search_root, dso_name);
-    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+    dso_handle = dlopen(dlPath.c_str(), dynload_flags);
     // if not found, search from default path
-    if (nullptr == *dso_handle) {
+    if (nullptr == dso_handle) {
       LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                    << dlerror() << ")";
       dlPath = dso_name;
-      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
     }
   }
   auto error_msg =
@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       "using the DYLD_LIBRARY_PATH is impossible unless System "
       "Integrity Protection (SIP) is disabled.";
   if (throw_on_error) {
-    PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
-  } else if (nullptr == *dso_handle) {
+    PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror());
+  } else if (nullptr == dso_handle) {
     LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
   }
+
+  return dso_handle;
 }
 
-void GetCublasDsoHandle(void** dso_handle) {
+void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
 }
 
-void GetCUDNNDsoHandle(void** dso_handle) {
+void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
-                             false);
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
 }
 
-void GetCUPTIDsoHandle(void** dso_handle) {
+void* GetCUPTIDsoHandle() {
   std::string cupti_path = cupti_lib_path;
   if (!FLAGS_cupti_dir.empty()) {
     cupti_path = FLAGS_cupti_dir;
   }
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false);
+  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
 #else
-  GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false);
+  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
 #endif
 }
 
-void GetCurandDsoHandle(void** dso_handle) {
+void* GetCurandDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
 #endif
 }
 
-void GetWarpCTCDsoHandle(void** dso_handle) {
+void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
 }
 
-void GetLapackDsoHandle(void** dso_handle) {
+void* GetLapackDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
 #endif
 }
 
-void GetNCCLDsoHandle(void** dso_handle) {
+void* GetNCCLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
 #endif
 }
 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index b5b9c4af916241c1c7361b506f74563ebcf69b9a..4c85093a43e0e8d75b64c5b29d1ec68db1b44909 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,55 +18,13 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-/**
- * @brief    load the DSO of CUBLAS
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCublasDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CUDNN
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCUDNNDsoHandle(void** dso_handle);
-
-void GetCUPTIDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CURAND
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCurandDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of warp-ctc
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetWarpCTCDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of lapack
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetLapackDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of NVIDIA nccl
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetNCCLDsoHandle(void** dso_handle);
+void* GetCublasDsoHandle();   // libcublas; enforces a non-null handle
+void* GetCUDNNDsoHandle();    // libcudnn; warns and may return nullptr
+void* GetCUPTIDsoHandle();    // libcupti; warns and may return nullptr
+void* GetCurandDsoHandle();   // libcurand; enforces a non-null handle
+void* GetWarpCTCDsoHandle();  // libwarpctc; enforces a non-null handle
+void* GetLapackDsoHandle();   // liblapacke; enforces a non-null handle
+void* GetNCCLDsoHandle();     // libnccl; enforces a non-null handle
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
index 3edc70c46d03ddcc751e865676928c47fcb48e69..2c40c48ee08497f9a2a414687b9c51d87ba574aa 100644
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
@@ -25,11 +25,6 @@ void *nccl_dso_handle;
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
-void LoadNCCLDSO() {
-  platform::call_once(nccl_dso_flag,
-                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
-}
-
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index dc78bcb44d3316a1ecee0c8d70dcb4777a9e2de4..d21e29df3cf9b2d78920d8bac41209d200b5ba3a 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <dlfcn.h>
 #include <nccl.h>
-#include <mutex>
+
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/call_once.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
@@ -28,18 +29,19 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-extern void LoadNCCLDSO();
 
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
-      paddle::platform::dynload::LoadNCCLDSO();                  \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
-    }                                                            \
-  };                                                             \
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                           \
+  struct DynLoad__##__name {                                             \
+    template <typename... Args>                                          \
+    auto operator()(Args... args) -> decltype(__name(args...)) {         \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);          \
+      std::call_once(nccl_dso_flag, []() {                               \
+        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
+      });                                                                \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);                \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
+    }                                                                    \
+  };                                                                     \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index f5ded0eb6b1107c886641e848f5040a7a2d806a5..7fa468370463a51c486b80317f401612930bc72e 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -15,9 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <dlfcn.h>
-#include <mutex>
-#include "ctc.h"
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "warpctc/include/ctc.h"
 
 namespace paddle {
 namespace platform {
@@ -31,18 +32,18 @@ extern void* warpctc_dso_handle;
  * (for each function) to dynamic load warpctc routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                            \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    auto operator()(Args... args) -> decltype(__name(args...)) {     \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);    \
-      std::call_once(warpctc_dso_flag,                               \
-                     paddle::platform::dynload::GetWarpCTCDsoHandle, \
-                     &warpctc_dso_handle);                           \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);          \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);      \
-    }                                                                \
-  };                                                                 \
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using warpctcFunc = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(warpctc_dso_flag, []() {                                  \
+        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
+      });                                                                      \
+      void* p_##__name = dlsym(warpctc_dso_handle, #__name);                   \
+      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);               \
+    }                                                                          \
+  };                                                                           \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 57962e6795cf531efcbf72e9e30e94d3013a27b8..36345e17406e22970806fa274d5a73a703517c43 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -23,9 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-//! Environment variable: fraction of GPU memory to use on each device.
-const char kEnvFractionGpuMemoryToUse[] = "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
-
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
 
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index fbe953b2d8f12ca529f3daa01cc8e2fe8875a416..4a9dbd324c90380e784cc9457845fabd858585be 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <Python.h>
 #include <string>
 #include <tuple>
 #include <vector>
diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py
index 7b4afa9bf65e1369329cd4648c1f5c4bd8fa8357..1c56064a1e8bdc5d975837cb5a75a40d557765ad 100644
--- a/python/paddle/fluid/debuger.py
+++ b/python/paddle/fluid/debuger.py
@@ -16,6 +16,7 @@ import sys
 import re
 from graphviz import GraphPreviewGenerator
 import proto.framework_pb2 as framework_pb2
+from google.protobuf import text_format
 
 _vartype2str_ = [
     "UNK",
@@ -100,7 +101,7 @@ def repr_var(vardesc):
 
 def pprint_program_codes(program_desc):
     reprs = []
-    for block_idx in range(program_desc.num_blocks()):
+    for block_idx in range(program_desc.desc.num_blocks()):
         block_desc = program_desc.block(block_idx)
         block_repr = pprint_block_codes(block_desc)
         reprs.append(block_repr)
@@ -127,7 +128,7 @@ def pprint_block_codes(block_desc, show_backward=False):
 
     if type(block_desc) is not framework_pb2.BlockDesc:
         block_desc = framework_pb2.BlockDesc.FromString(
-            block_desc.serialize_to_string())
+            block_desc.desc.serialize_to_string())
     var_reprs = []
     op_reprs = []
     for var in block_desc.vars:
@@ -237,13 +238,13 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
     # draw parameters and args
     vars = {}
     for var in desc.vars:
-        shape = [str(i) for i in var.lod_tensor.tensor.dims]
-        if not shape:
-            shape = ['null']
+        # TODO(gongwb): format the var.type
         # create var
         if var.persistable:
             varn = graph.add_param(
-                var.name, var.type, shape, highlight=need_highlight(var.name))
+                var.name,
+                str(var.type).replace("\n", "<br />", 1),
+                highlight=need_highlight(var.name))
         else:
             varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
         vars[var.name] = varn
@@ -268,4 +269,4 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
         for var in op.outputs:
             add_op_link_var(opn, var, True)
 
-    graph(path, show=True)
+    graph(path, show=False)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 39d4017861f4d2ac2e8e85c3d70440a43e6cdc71..fbe4531f056a583d2b4d855d32fe8e04da94fa3c 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -640,6 +640,20 @@ class Operator(object):
         """
         return self.desc.block_attr(name)
 
+    def all_attrs(self):
+        """
+        Collect every attribute of this operator into a dict.
+
+        Returns(dict): Maps each name in `attr_names` to its value; the
+        'sub_block' attribute is fetched through `block_attr`.
+        """
+        attr_map = {}
+        for name in self.attr_names:
+            # 'sub_block' is read via block_attr(), everything else via attr().
+            getter = self.block_attr if name == 'sub_block' else self.attr
+            attr_map[name] = getter(name)
+        return attr_map
+
 
 class Block(object):
     def __init__(self, program, idx):
@@ -838,7 +852,7 @@ class Block(object):
 
     def sync_with_cpp(self):
         """
-        Sync with the desc on the c++ end.
+        Sync from the desc on the c++ end.
 
         This method is used to synchronize the c++ desc instance generated by backward.
         """
@@ -951,6 +965,13 @@ class Block(object):
         if var.type == core.VarDesc.VarType.STEP_SCOPES:
             ret_var = self.create_var(
                 name=var.name, persistable=var.persistable, type=var.type)
+        elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
+            ret_var = self.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                persistable=True)
         else:
             ret_var = self.create_var(
                 name=var.name,
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
index b8d21344fc8f65f4025f28a195dab2d371b30292..125b4efa9d476e561bd78d0365cd92bbf7e66605 100644
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -83,7 +83,7 @@ class Graph(object):
         file = open(dot_path, 'w')
         file.write(self.__str__())
         image_path = os.path.join(
-            os.path.dirname(__file__), dot_path[:-3] + "pdf")
+            os.path.dirname(dot_path), os.path.basename(dot_path)[:-3] + "pdf")
         cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
         subprocess.Popen(
             cmd,
@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object):
         else:
             self.graph.show(path)
 
-    def add_param(self, name, data_type, shape, highlight=False):
+    def add_param(self, name, data_type, highlight=False):
         label = '\n'.join([
             '<<table cellpadding="5">',
             '  <tr>',
@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object):
             str(data_type),
             '    </td>'
             '  </tr>',
-            '  <tr>',
-            '    <td>',
-            '[%s]' % 'x'.join(shape),
-            '    </td>'
-            '  </tr>',
             '</table>>',
         ])
         return self.graph.node(
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index bd7e9c30fed2c38a206bf17a646d8a4433af4099..969398bda4cfd0b2f5e39f45d34a1da9b216901f 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var):
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
     new_var.persistable = True
-    return monkey_patch_reader_methods(new_var)
+    return new_var
+
+
+def _copy_reader_create_op_(block, op):
+    """
+    Append to `block` a copy of `op`, with every input and output argument
+    name resolved to the variable of that name in `block`.
+
+    Returns the newly appended operator.
+    """
+    new_input_map = {
+        param_name:
+        [block.var(arg_name) for arg_name in op.input(param_name)]
+        for param_name in op.input_names
+    }
+    new_output_map = {
+        param_name:
+        [block.var(arg_name) for arg_name in op.output(param_name)]
+        for param_name in op.output_names
+    }
+
+    return block.append_op(
+        type=op.type,
+        inputs=new_input_map,
+        outputs=new_output_map,
+        attrs=op.all_attrs())
 
 
 def open_recordio_file(filename, shapes, lod_levels, dtypes):
@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
 
     startup_var.desc.set_dtypes(dtypes)
     startup_var.persistable = True
-    return _copy_reader_var_(default_main_program().current_block(),
-                             startup_var)
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+    return monkey_patch_reader_methods(main_prog_var)
 
 
 def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
@@ -313,22 +339,25 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
 
     startup_var.desc.set_dtypes(dtypes)
     startup_var.persistable = True
-    return _copy_reader_var_(default_main_program().current_block(),
-                             startup_var)
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+    return monkey_patch_reader_methods(main_prog_var)
 
 
 def __create_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
-    startup_blk.append_op(
+    startup_op = startup_blk.append_op(
         type=op_type,
         inputs={'UnderlyingReader': reader},
         outputs={'Out': [startup_var]},
         attrs=attrs)
     startup_var.persistable = True
-    return _copy_reader_var_(default_main_program().current_block(),
-                             startup_var)
+    main_prog_block = default_main_program().current_block()
+    main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
+    _copy_reader_create_op_(main_prog_block, startup_op)
+    return monkey_patch_reader_methods(main_prog_var)
 
 
 def create_shuffle_reader(reader, buffer_size):
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index 2b7bbf9218f9b8fd8f5b29ac3cbc2f9680f471eb..67b03f635b6f8a3003efabe5425325080d47f61c 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
 
-        print(debuger.pprint_program_codes(p.desc))
+        print(debuger.pprint_program_codes(p))
+
+        debuger.draw_block_graphviz(p.block(0), path="./test.dot")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 640264d82f0dc7fa71bf882d5549e30b87b8d7c5..24a0074d9b9621d902d12eb8cb29d9b65be22ed3 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -15,8 +15,8 @@
 import unittest
 
 import paddle.fluid as fluid
-import paddle
-import paddle.dataset.mnist as mnist
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
 
 
 class TestRecordIO(unittest.TestCase):
diff --git a/python/setup.py.in b/python/setup.py.in
index 2707d34a2ab327ab4282aa7473d78a3f5c08e890..5e7096e225e08d19e89051603bbc07eff945c78a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -107,6 +107,7 @@ package_dir={
     # So that package points to other directory.
     'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
     'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
 }
 if '${WITH_FLUID_ONLY}'== 'OFF':
     package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'

	Linear Regression	Recognize Digits	Image Classification	Word2Vec	Personalized Recommendation	Sentiment Analysis	Semantic Role Labeling	Machine Translation
API.V2 + Docker + GPU
API.V2 + Docker + CPU
`paddle_trainer` + Docker + GPU
`paddle_trainer` + Docker + CPU
API.V2 + Ubuntu + GPU
API.V2 + Ubuntu + CPU
`paddle_trainer` + Ubuntu + GPU
`paddle_trainer` + Ubuntu + CPU