Merge branch 'develop' into core_inference_fix_run

a9855e4a · Liu Yiqun · fecbf7ff · b1a5a3ca · a9855e4a · a9855e4a
184 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/

 # clion workspace.
 cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")

--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")

-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappystream to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
 add_dependencies(snappystream extern_snappystream)
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )

 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.

 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)

-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.

 ExternalProject_Add(
    extern_zlib

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -244,14 +244,14 @@ function(cc_test TARGET_NAME)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
    endif()
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
 endfunction(cc_test)

@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(nv_test)
@@ -387,8 +387,8 @@ function(hip_test TARGET_NAME)
    endif()
    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
-    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(hip_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
 endfunction()


--- a/doc/design/file_manager/README.md
+++ b/doc/design/file_manager/README.md
-# FileManager设计文档
-## 目标
-在本文档中，我们设计说明了名为FileManager系统，方便用户上传自己的训练数据以进行分布式训练
-
-主要功能包括：
-
- 提供常用的命令行管理命令管理文件和目录
- 支持大文件的断点上传、下载  
-
-## 名词解释
- PFS：是`Paddlepaddle cloud File System`的缩写，是对用户文件存储空间的抽象，与之相对的是local filesystem。目前我们用CephFS来搭建。
- [CephFS](http://docs.ceph.com/docs/master/cephfs/)：一个POSIX兼容的文件系统。
- Chunk：逻辑划上文件分块的单位。
-
-## 模块
-### 架构图
-<image src=./src/filemanager.png width=900>
-
-### PFSClient
- 功能： 详细设计[link](./pfs/pfsclient.md)
-	- 提供用户管理文件的命令
-	- 需要可以跨平台执行
-
- 双向验证   
-	PFSClient需要和Ingress之间做双向验证<sup>[tls](#tls)</sup>，所以用户需要首先在`cloud.paddlepaddle.org`上注册一下，申请用户空间，并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地，然后才能使用PFSClient。
-		
-### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
- 功能：  
-	提供七层协议的反向代理、基于粘性会话的负载均衡功能。
-	
- 透传用户身份的办法  
-	Ingress需要把PFSClient的身份信息传给PFSServer，配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
-
-### PFSServer
-PFSServer提供RESTful API接口，接收处理PFSClient端的文件管理请求，并且把结果返回PFSClient端。
-
-RESTful API
-
- /api/v1/files
-	- `GET /api/v1/files`: Get metadata of files or directories.
-	- `POST /api/v1/files`: Create files or directories.
-	- `PATCH /api/v1/files`: Update files or directories.
-	- `DELETE /api/v1/files`: Delete files or directories.
-
- /api/v1/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file.
-
- /api/v1/storage/files
-	- `GET /api/v1/storage/files`: Download files or directories.
-	- `POST /api/v1/storage/files`: Upload files or directories.
-
- /api/v1/storage/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Download chunks's data.
-	- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
-
-## 文件传输优化
-
-### 分块文件传输
-用户文件可能是比较大的，上传到Cloud或者下载到本地的时间可能比较长，而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题，我们提出了Chunk的概念，一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小（默认256K），完成一个传输动作完成的时间也比较短，不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
-
-一个典型的Chunk如下所示：
-
-```
-type Chunk struct {
-	fileOffset int64
-	checksum uint32
-	len     uint32
-	data    []byte
-}
-```  
-
-### 生成sparse文件
-当destination文件不存在或者大小和source文件不一致时，可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件，然后就可以并发写入多个Chunk。
-
-### 覆盖不一致的部分
-文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致，不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
-
-## 用户使用流程
-参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
-
-## 框架生成
-用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分，以便我们可以把更多的精力放到逻辑本身上。
-
-## 参考文档
- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
- [linux man document](https://linux.die.net/man/)
--- a/doc/design/file_manager/pfs/pfsclient.md
+++ b/doc/design/file_manager/pfs/pfsclient.md
-# PFSClient
-
-## Description
-The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
-
-## Synopsis
-```
-paddle [options] pfs <subcommand> [parameters]
-```
-
-## Options
-```
--profile (string)
-	Use a specific profile from your credential file.
-
--help (string)
-	Display more information about command
-
--version
-	Output version information and exit
-
--debug
-	Show detailed debugging log	
-	
--only-show-errors (boolean) 
-	Only errors and warnings are displayed. All other output is suppressed.
-```
-
-## Path Arguments
-When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`.  
-
-A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`.
-
-[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
-
-## order of Path Arguments
-Commonly, if there are two path arguments, the first is the source, and the second is the destination.
-
-## Subcommonds
- rm - remove files or directories
-
-```
-Synopsis:
-	rm [-r] [-v] <PFSPath> ...
-
-Options:
-	-r 
-		Remove directories and their contents recursively 
-	-v      
-		Cause rm to be verbose, showing files after they are removed.
-	
-Examples:
-	paddle pfs rm /pfs/$DATACENTER/home/$USER/file
-	paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
-```
- mv - move (rename) files
-
-```
-Synopsis:
-	mv [-f | -n] [-v] <LocalPath> <PFSPath>
-	mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
-	mv [-f | -n] [-v] <PFSPath> <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> <PFSPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <PFSPath> 
-	
-Options:
-	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause mv to be verbose, showing files after they are moved.
-		
-Examples:
-	paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
-```
- cp - copy files or directories
-
-```
-Synopsis:
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
-
-Options:
-	-r
-   		Copy directories recursively
-   	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause cp to be verbose, showing files after they are copied.
-	--preserve--links
-	   Reserve links when copy links
-	   
-Examples:
-	paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
-	paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
-```
- ls- list files
-
-```
-Synopsis:
-	ls [-r] <PFSPath> ...
-	
-Options:
-	-R
-   		List directory(ies) recursively
-
-Examples:
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/file
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/folder
-```
-
- mkdir - mkdir directory(ies)
-Create intermediate directory(ies) as required.
-
-```
-Synopsis:
-	mkdir <PFSPath> ...
-
-Examples:
-	paddle pfs mkdir  /pfs/$DATACENTER/home/$USER/folder
-```
--- a/doc/design/file_manager/src/filemanager.graffle
+++ b/doc/design/file_manager/src/filemanager.graffle
--- a/doc/design/file_manager/src/filemanager.png
+++ b/doc/design/file_manager/src/filemanager.png
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)

 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})

-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)

 add_subdirectory(api)
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -9,5 +9,5 @@
  use_eigen_cn.md
  name_convention.md
  support_new_device.md
-  releasing_process.md
+  releasing_process_cn.md
  op_markdown_format.md
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -9,5 +9,5 @@ Development
  use_eigen_en.md
  name_convention.md
  support_new_device.md
-  releasing_process.md
+  releasing_process_en.md
  op_markdown_format.md
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -10,19 +10,10 @@ PaddlePaddle每次发新的版本，遵循以下流程:
  * 使用Regression Test List作为检查列表，测试本次release的正确性。
 	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
 	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-	* 编译这个版本的python wheel包，并发布到pypi。
-		* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
-		* pypi上的package名称为paddlepaddle和paddlepaddle_gpu，如果要上传GPU版本的包，需要修改build/python/setup.py中，name: "paddlepaddle_gpu"并重新打包wheel包：`python setup.py bdist_wheel`。
-		* 上传方法：
-			```
-			cd build/python
-			pip install twine
-			twine upload dist/[package to upload]
-			```
-		* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
-1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 协同完成Release Note的书写
-
+	* 将这个版本的python wheel包发布到pypi。
+	* 更新Docker镜像（参考后面的操作细节）。
+1. 第三步完成后，将`release/版本号`分支合入master分支，将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。
+1. 协同完成Release Note的书写。

 需要注意的是:

@@ -31,13 +22,18 @@ PaddlePaddle每次发新的版本，遵循以下流程:

 ## 发布wheel包到pypi

-使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
 完成自动化二进制编译，参考下图，选择需要发布的版本（通常包含一个CPU版本和一个GPU版本），点击"run"右侧的"..."按钮，可以
-弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。等待编译完成后
-可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。然后按照上述的方法
-使用`twine`工具上传即可。
-
-<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传：
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```

 * 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
  发行版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
@@ -48,10 +44,20 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub，所以，发布Docker镜像只需要对自动push的镜像打上
 版本号对应的tag即可：

-1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。
-1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`，latest tag可以是latest或latest-gpu等。
-1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. 执行 `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括：
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本（CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。

 ## PaddlePaddle 分支规范

@@ -76,7 +82,7 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-

 ### PaddlePaddle Book中所有章节

-PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练（V2和Fluid）模型正确性。

 <table>
 <thead>

--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using the "git-flow branching model", and uses [Semantic Versioning](http://semver.org/) for its version numbers.
+
+Each time we release a new PaddlePaddle version, we should follow the below steps:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, the second should be `0.10.0rc2`, and so on.
+1. After that, we should do:
+  * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
+      that this release has no major bugs.
+        * If regression test fails, we must fix those bugs and create a new `release/[version]`
+          branch from previous release branch.
+    * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+    * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+    * Update the Docker images (see below instructions for detail).
+1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
+   then merge `master` to `develop`.
+1. Update the Release Note.          
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches, so that each release branch
+  contains only the features of its own release and we can test against that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all wheel packages needed to publish. As shown in the following picture, choose a build
+     version, click "..." button on the right side of "Run" button, and switch to the second tab in the
+pop-up box, choose the current release branch and click "Run Build" button. You may repeat this
+     step to start different versions of builds.
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+    scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already uploaded version of a wheel package, even if you delete the
+  old version. You must change the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version (using CUDA 8.0 cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then check the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow the
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model; every fork is treated like a feature branch.
+    * Advice: use the develop branch of a developer's fork to sync up with the main repo's develop branch.
+    * Advice: developers fork new feature branches from their fork's develop branch to start developing.
+  * Use that branch on developer's fork to create pull requests and start reviews.
+      * developer can push new commits to that branch when the pull request is open.
+* Bug fixes also start from a developer's forked repo, and a bug-fix branch can be merged into
+  `master`, `develop` and `releases`.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book run correctly, including
+V1 (`paddle_trainer` training), V2 training, and Fluid training.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Linear Regression</th>
+<th>Recognize Digits</th>
+<th>Image Classification</th>
+<th>Word2Vec</th>
+<th>Personalized Recommendation</th>
+<th>Sentiment Analysis</th>
+<th>Semantic Role Labeling</th>
+<th>Machine Translation</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle

--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)

 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})

-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)

 add_subdirectory(api)
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})

-add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
    ${START_END}
 )

-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
    DEPENDS _swig_paddle
 )

 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)

 if(WITH_TESTING)
    IF(NOT PY_PIP_FOUND)

--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
+# Copy the Python API test scripts from the source directory into the build
+# directory. The rule copies every *.py file but declares testTrain.py as its
+# single OUTPUT, so that one file stands in for the whole copy step.
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+# Depend on the absolute build-tree path. A bare relative "testTrain.py" can
+# resolve to the file of the same name in the source directory, in which case
+# the copy rule above would never be triggered by this target.
+add_custom_target(copy_api_test ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py)
+
 py_test(testTrain SRCS testTrain.py)
 py_test(testMatrix SRCS testMatrix.py)
 py_test(testVector SRCS testVector.py)

--- a/paddle/fluid/framework/.clang-format
+++ b/paddle/fluid/framework/.clang-format
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -7,9 +7,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)

 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place memory device_context framework_proto)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place memory device_context framework_proto)
 endif()

 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -21,9 +21,9 @@ endif()

 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)

 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})


--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <deque>
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>

@@ -96,6 +97,8 @@ class BlockDesc {
   */
  void RemoveOp(size_t s, size_t e);

+  void RemoveVar(const std::string &name) { vars_.erase(name); }
+
  std::vector<OpDesc *> AllOps() const;

  size_t OpSize() const { return ops_.size(); }

--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,8 +14,8 @@ limitations under the License. */

 #pragma once

-#include <stddef.h>  // for size_t
-#include <condition_variable>
+#include <stddef.h>            // for size_t
+#include <condition_variable>  // NOLINT
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"

@@ -216,7 +216,8 @@ class ChannelHolder {

  template <typename T>
  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+    explicit PlaceholderImpl(size_t buffer_size)
+        : type_(std::type_index(typeid(T))) {
      channel_.reset(MakeChannel<T>(buffer_size));
    }


--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>  // for size_t
 #include <atomic>
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <deque>
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
  virtual void Unlock();
  virtual bool IsClosed();
  virtual void Close();
-  ChannelImpl(size_t);
+  explicit ChannelImpl(size_t);
  virtual ~ChannelImpl();

  virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
    const void *referrer;  // TODO(thuan): figure out better way to do this
    std::function<bool(ChannelAction)> callback;

-    QueueMessage(T *item)
+    explicit QueueMessage(T *item)
        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}

    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel<T> {
  }

  std::shared_ptr<QueueMessage> get_first_message(
-      std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
-    while (!queue.empty()) {
+      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+    while (!queue->empty()) {
      // Check whether this message was added by Select
      // If this was added by Select then execute the callback
      // to check if you can execute this message. The callback
      // can return false if some other case was executed in Select.
      // In that case just discard this QueueMessage and process next.
-      std::shared_ptr<QueueMessage> m = queue.front();
-      queue.pop_front();
+      std::shared_ptr<QueueMessage> m = queue->front();
+      queue->pop_front();
      if (m->callback == nullptr || m->callback(action)) return m;
    }
    return nullptr;
@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) {
  // to send to the receiver, bypassing the channel buffer if any
  if (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m =
-        get_first_message(recvq, ChannelAction::SEND);
+        get_first_message(&recvq, ChannelAction::SEND);

    if (m != nullptr) {
      *(m->data) = std::move(*item);
@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) {
  // buffer and move front of send queue to the buffer
  if (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m =
-        get_first_message(sendq, ChannelAction::RECEIVE);
+        get_first_message(&sendq, ChannelAction::RECEIVE);
    if (buf_.size() > 0) {
      // Case 1 : Channel is Buffered
      // Do Data transfer from front of buffer
@@ -219,8 +219,9 @@ bool ChannelImpl<T>::Receive(T *item) {
      if (m != nullptr) {
        *item = std::move(*(m->data));
        m->Notify();
-      } else
+      } else {
        return recv_return(Receive(item));
+      }
    }
    return recv_return(true);
  }

--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */

 #include "paddle/fluid/framework/channel.h"

-#include <chrono>
-#include <thread>
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
 #include "gtest/gtest.h"

 using paddle::framework::Channel;
@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  std::thread t([&]() {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      if (i < buffer_size)
+      if (i < buffer_size) {
        ch->Send(&i);  // should block after 10 iterations
-      else {
+      } else {
        bool is_exception = false;
        try {
          ch->Send(&i);
@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
 }

 void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];

  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }

@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];

  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    send_success[i] = false;
    t[i] = std::thread(
@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
  if (isBuffered) {
    // If ch is Buffered, atleast 4 threads must be blocked.
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (!thread_ended[i]) ct++;
    }
    EXPECT_GE(ct, 4);
  } else {
    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      EXPECT_EQ(thread_ended[i], false);
    }
  }
@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  if (isBuffered) {
    // Verify that only 1 send was successful
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (send_success[i]) ct++;
    }
    // Only 1 send must be successful
    EXPECT_EQ(ct, 1);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 // This tests that closing a buffered channel also unblocks
@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 //  any senders waiting for channel to have write space
 void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];

  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    send_success[i] = false;
    t[i] = std::thread(
@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
  if (isBuffered) {
    // If channel is buffered, verify that atleast 4 threads are blocked
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (thread_ended[i] == false) ct++;
    }
    // Atleast 4 threads must be blocked
    EXPECT_GE(ct, 4);
  } else {
    // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      EXPECT_EQ(thread_ended[i], false);
    }
  }
@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  // Count number of successful sends
  int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    if (send_success[i]) ct++;
  }

@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
  }

  // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 // This tests that destroying a channel also unblocks
 //  any receivers waiting on the channel
 void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];

  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait

  // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }
  // delete the channel
  delete ch;
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
 }

 void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];

  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

  // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }

@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];

  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    send_success[i] = false;
    t[i] = std::thread(
@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
  if (isBuffered) {
    // If ch is Buffered, atleast 4 threads must be blocked.
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (!thread_ended[i]) ct++;
    }
    EXPECT_GE(ct, 4);
  } else {
    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      EXPECT_EQ(thread_ended[i], false);
    }
  }
@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  if (isBuffered) {
    // Verify that only 1 send was successful
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (send_success[i]) ct++;
    }
    // Only 1 send must be successful
    EXPECT_EQ(ct, 1);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 // This tests that closing a channelholder unblocks
@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
 // This tests that destroying a channelholder unblocks
 //  any senders waiting for channel
 void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];

  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    send_success[i] = false;
    t[i] = std::thread(
@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
  if (isBuffered) {
    // If channel is buffered, verify that atleast 4 threads are blocked
    int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      if (thread_ended[i] == false) ct++;
    }
    // Atleast 4 threads must be blocked
    EXPECT_GE(ct, 4);
  } else {
    // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
      EXPECT_EQ(thread_ended[i], false);
    }
  }
@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

  // Count number of successfuld sends
  int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    if (send_success[i]) ct++;
  }

@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
  }

  // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 // This tests that destroying a channelholder also unblocks
 //  any receivers waiting on the channel
 void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];

  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait

  // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], false);
  }
  // delete the channel
  delete ch;
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
  // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }

-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {

 // This tests that closing a channelholder many times.
 void ChannelHolderManyTimesClose(ChannelHolder *ch) {
-  const int num_threads = 15;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 15;  // thirds: senders / receivers / closers
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];

  // Launches threads that try to send data to channel.
-  for (size_t i = 0; i < num_threads / 3; i++) {
+  for (size_t i = 0; i < kNumThreads / 3; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *ended) {
@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
  }

  // Launches threads that try to receive data to channel.
-  for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+  for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
  }

  // Launches threads that try to close the channel.
-  for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+  for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
    thread_ended[i] = false;
    t[i] = std::thread(
        [&](bool *p) {
@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait

  // Verify that all threads are unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
    EXPECT_EQ(thread_ended[i], true);
  }
  EXPECT_TRUE(ch->IsClosed());
  // delete the channel
  delete ch;
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }

 TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,6 +16,6 @@ else()
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
            scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
    return (lod_)[level].size() - 1;
  }

+  // Split LoDTensor and copy to each place specified in places.
  std::vector<LoDTensor> SplitLoDTensor(
      const std::vector<platform::Place> places) const;


--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
 };

+proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
+  if (var->IsType<framework::LoDTensor>()) {
+    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return framework::ToDataType(
+        var->Get<framework::SelectedRows>().value().type());
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+}
+
 static DDim GetDims(const Scope& scope, const std::string& name) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
  return var_name + kGradVarSuffix;
 }

+proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+
 class OperatorBase;
 class ExecutionContext;


--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/platform/profiler.h"

 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };

+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
    size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;

-  // Step 1. RunStartupProgram and Bcast the params to devs.
-  Executor exe(places[0]);
-  exe.Run(startup_program, scope, 0);
+  // Step 1. Bcast the params to devs.
  // Create local scopes
-  for (size_t i = 0; i < member_->places_.size(); ++i) {
-    member_->local_scopes_.push_back(&scope->NewScope());
+  if (local_scopes.empty()) {
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(&scope->NewScope());
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(local_scopes[i]);
+    }
  }

 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+      local_scopes.empty()) {  // Is CUDA
+    BCastParamsToGPUs(bcast_vars);
  }
 // Startup Program has been run. All local scopes has correct parameters.

@@ -99,48 +109,47 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
  auto *main_scope = member_->local_scopes_[0];

-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
-    size_t idx = var_desc->Name().find("@GRAD");
-    if (idx != std::string::npos) continue;
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
-
-      auto &dims = main_tensor.dims();
-
-      if (paddle::platform::is_gpu_place(main_tensor.place())) {
-        size_t numel = main_tensor.numel();
-        ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-        platform::NCCLGroupGuard guard;
-        for (size_t i = 0; i < member_->places_.size(); ++i) {
-          auto place = member_->places_[i];
-          void *buffer;
-          if (i == 0) {
-            buffer = const_cast<void *>(main_tensor.data<void>());
-          } else {
-            auto local_scope = member_->local_scopes_[i];
-            auto *t =
-                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
-            t->Resize(dims);
-            buffer = t->mutable_data(place, main_tensor.type());
-          }
-          auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-          platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                       nccl_ctx.comm_, nccl_ctx.stream());
-        }
-      } else {
-        platform::CPUPlace cpu;
-        for (size_t i = 1; i < member_->places_.size(); ++i) {
+  for (auto &var : vars) {
+    auto *main_var = main_scope->FindVar(var);
+    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
+      continue;  // Nothing to broadcast: var is absent or not a LoDTensor.
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();
+
+    auto &dims = main_tensor.dims();
+
+    if (paddle::platform::is_gpu_place(main_tensor.place())) {
+      size_t numel = main_tensor.numel();
+      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+      platform::NCCLGroupGuard guard;
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        auto place = member_->places_[i];
+        void *buffer;
+        if (i == 0) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+        } else {
          auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
          t->Resize(dims);
-          t->mutable_data(cpu, main_tensor.type());
-          paddle::framework::TensorCopy(main_tensor, cpu, t);
+          buffer = t->mutable_data(place, main_tensor.type());
        }
+        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
+        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
+                                     nccl_ctx.comm_, nccl_ctx.stream());
+      }
+    } else {
+      platform::CPUPlace cpu;
+      for (size_t i = 1; i < member_->places_.size(); ++i) {
+        auto local_scope = member_->local_scopes_[i];
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+        t->Resize(dims);
+        t->mutable_data(cpu, main_tensor.type());
+        paddle::framework::TensorCopy(main_tensor, cpu, t);
      }
    }
    member_->nccl_ctxs_->WaitAll();
@@ -150,13 +159,30 @@ void ParallelExecutor::BCastParamsToGPUs(
 #endif
 }

-void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
-                           const std::string &fetched_var_name) {
+void ParallelExecutor::Run(
+    const std::vector<std::string> &fetch_tensors,
+    const std::string &fetched_var_name,
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
  platform::RecordBlock b(0);
+  SplitTensorToPlaces(feed_tensors);
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
 }

+// Split each feed tensor over the places; slice j goes to local scope j.
+void ParallelExecutor::SplitTensorToPlaces(
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+  for (const auto &it : feed_tensors) {  // by reference: no per-entry copy
+    auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+    PADDLE_ENFORCE_EQ(member_->places_.size(), lod_tensors.size());
+    for (size_t j = 0; j < member_->places_.size(); ++j) {
+      // TODO(panxy0718): Do I need to delete this var?
+      auto *var = member_->local_scopes_[j]->Var(it.first);
+      var->GetMutable<LoDTensor>()->ShareDataWith(lod_tensors[j]);
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -36,18 +36,25 @@ class ParallelExecutor {
  explicit ParallelExecutor(size_t num_threads, bool use_event,
                            const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                            const ProgramDesc& main_program,
                            const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
                            bool allow_op_delay);

+  std::vector<Scope*>& GetLocalScopes();
+
  void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name = "fetched_var");
+           const std::string& fetched_var_name,
+           const std::unordered_map<std::string, LoDTensor>& feed_tensors);

 private:
+  void SplitTensorToPlaces(
+      const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+
  ParallelExecutorPrivate* member_;

-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };

 }  // namespace framework

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"

 #include <memory>  // for unique_ptr
-#include <mutex>   // for call_once
 #include <set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -39,6 +38,7 @@ Scope::~Scope() {
 }

 Scope& Scope::NewScope() const {
+  std::unique_lock<std::mutex> lock(mutex_);
  kids_.push_back(new Scope(this));
  return *kids_.back();
 }
@@ -92,6 +92,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }

 void Scope::DeleteScope(Scope* scope) {
+  std::unique_lock<std::mutex> lock(mutex_);
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
  this->kids_.erase(it);
@@ -103,7 +104,7 @@ void Scope::DeleteScope(Scope* scope) {
  }
 }

-void Scope::EraseVars(std::vector<std::string>& var_names) {
+void Scope::EraseVars(const std::vector<std::string>& var_names) {
  std::set<std::string> var_set(var_names.begin(), var_names.end());
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <list>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -51,7 +52,7 @@ class Scope {
  /// Create a variable with a scope-unique name.
  Variable* Var(std::string* name = nullptr);

-  void EraseVars(std::vector<std::string>& var_names);
+  void EraseVars(const std::vector<std::string>& var_names);

  /// Find a variable in the scope or any of its ancestors.  Returns
  /// nullptr if cannot find.
@@ -88,6 +89,9 @@ class Scope {
  Scope const* parent_{nullptr};

  DISABLE_COPY_AND_ASSIGN(Scope);
+
+ private:
+  mutable std::mutex mutex_;
 };
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
    http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +16,7 @@ limitations under the License. */

 namespace paddle {
 namespace framework {
+
 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
                       const platform::DeviceContext& dev_ctx) {
  {  // the 1st field, uint32_t version

--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
    http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,6 +50,15 @@ class SelectedRows {

  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }

+  /**
+   * get the index of id in rows
+   */
+  int64_t index(int64_t id) const {
+    auto it = std::find(rows_.begin(), rows_.end(), id);
+    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
  DDim GetCompleteDims() const {
    std::vector<int64_t> dims = vectorize(value_->dims());
    dims[0] = height_;

--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place)) {
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
    }
 #else
-      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
    }
 #endif
    offset_ = 0;
@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {

 inline void* Tensor::mutable_data(platform::Place place) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing");
+                 "Cannot invoke mutable data if current hold nothing.");
  return mutable_data(place, holder_->type());
 }


--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -35,24 +35,25 @@ class Tuple {
 public:
  using ElementVars = std::vector<ElementVar>;

-  Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
+  Tuple(const std::vector<ElementVar>& var,
+        const std::vector<VarDesc>& var_desc)
      : var_(var), var_desc_(var_desc) {}
-  Tuple(std::vector<ElementVar>& var) : var_(var) {}
+  explicit Tuple(const std::vector<ElementVar>& var) : var_(var) {}

-  ElementVar get(int idx) const { return var_[idx]; };
+  ElementVar get(int idx) const { return var_[idx]; }

-  ElementVar& get(int idx) { return var_[idx]; };
+  ElementVar& get(int idx) { return var_[idx]; }

-  bool isSameType(Tuple& t) const;
+  bool isSameType(const Tuple& t) const;

-  size_t getSize() const { return var_.size(); };
+  size_t getSize() const { return var_.size(); }

 private:
  ElementVars var_;
  std::vector<VarDesc> var_desc_;
 };

-bool Tuple::isSameType(Tuple& t) const {
+bool Tuple::isSameType(const Tuple& t) const {
  size_t tuple_size = getSize();
  if (tuple_size != t.getSize()) {
    return false;

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
-set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)

 cc_library(paddle_fluid_api
    SRCS io.cc

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
  return false;
 }

-void LoadPersistables(framework::Executor& executor,
-                      framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
                      const std::string& param_filename) {
@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
 }

 std::unique_ptr<framework::ProgramDesc> Load(
-    framework::Executor& executor,
-    framework::Scope& scope,
-    const std::string& prog_filename,
-    const std::string& param_filename) {
+    framework::Executor& executor, framework::Scope& scope,
+    const std::string& prog_filename, const std::string& param_filename) {
  std::string model_filename = prog_filename;
  std::string program_desc_str;
  ReadBinaryFile(model_filename, program_desc_str);

--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -24,8 +24,7 @@ limitations under the License. */
 namespace paddle {
 namespace inference {

-void LoadPersistables(framework::Executor& executor,
-                      framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,
                      const std::string& param_filename);

--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
  set(multiValueArgs ARGS)
  cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
+  set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
  set(arg_list "")
  if(inference_test_ARGS)
    foreach(arg ${inference_test_ARGS})

--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
  // The second dim of the input tensor should be 13
  // The input data should be >= 0
  int64_t batch_size = 10;
-  SetupTensor<float>(
-      input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
+  SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
+                     static_cast<float>(10));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);


--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
  paddle::framework::LoDTensor input;
  // Use normilized image pixels as input data,
  // which should be in the range [0.0, 1.0].
-  SetupTensor<float>(input,
-                     {FLAGS_batch_size, 3, 32, 32},
-                     static_cast<float>(0),
-                     static_cast<float>(1));
+  SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
+                     static_cast<float>(0), static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

@@ -48,8 +46,8 @@ TEST(inference, image_classification) {

  // Run inference on CPU
  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(
-      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
+  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
+                                            FLAGS_repeat);
  LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -59,8 +57,8 @@ TEST(inference, image_classification) {

  // Run inference on CUDA GPU
  LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(
-      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
+  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
+                                             FLAGS_repeat);
  LOG(INFO) << output2.dims();

  CheckError<float>(output1, output2);

--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
  int64_t predicate_dict_len = 3162;
  int64_t mark_dict_len = 2;

-  SetupLoDTensor(word,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(predicate,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(predicate_dict_len - 1));
-  SetupLoDTensor(ctx_n2,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_n1,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_0,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p1,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p2,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(mark,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(mark_dict_len - 1));

  std::vector<paddle::framework::LoDTensor*> cpu_feeds;

--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
  paddle::framework::LoDTensor input;
  // Use normilized image pixels as input data,
  // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(input,
-                     {FLAGS_batch_size, 1, 28, 28},
-                     static_cast<float>(-1),
-                     static_cast<float>(1));
+  SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
+                     static_cast<float>(-1), static_cast<float>(1));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {

    // Run inference on CPU
    LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CPUPlace>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+    TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
+                                              FLAGS_repeat, is_combined);
    LOG(INFO) << output1.dims();

 #ifdef PADDLE_WITH_CUDA
@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {

    // Run inference on CUDA GPU
    LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CUDAPlace>(
-        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+    TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
+                                               FLAGS_repeat, is_combined);
    LOG(INFO) << output2.dims();

    CheckError<float>(output1, output2);

--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {

  // Use the first data from paddle.dataset.movielens.test() as input
  std::vector<int64_t> user_id_data = {1};
-  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+  SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);

  std::vector<int64_t> gender_id_data = {1};
-  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+  SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);

  std::vector<int64_t> age_id_data = {0};
-  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+  SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);

  std::vector<int64_t> job_id_data = {10};
-  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+  SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);

  std::vector<int64_t> movie_id_data = {783};
-  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+  SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);

  std::vector<int64_t> category_id_data = {10, 8, 9};
-  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+  SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);

  std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
-  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+  SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);

  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&user_id);

--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
  paddle::framework::LoDTensor word_data, trg_word;
  paddle::framework::LoD lod{{0, 4, 10}};

-  SetupLoDTensor(
-      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(
-      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
+                 static_cast<int64_t>(1));
+  SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
+                 static_cast<int64_t>(1));

  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&word_data);

--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
  paddle::framework::LoD lod{{0, 4, 10}};
  int64_t word_dict_len = 5147;

-  SetupLoDTensor(words,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(word_dict_len - 1));

  std::vector<paddle::framework::LoDTensor*> cpu_feeds;

--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"

 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
  paddle::framework::LoD lod{{0, 1}};
  int64_t dict_size = 2073;  // The size of dictionary

-  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);

  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&first_word);

--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma once
+
+#include <map>
+#include <random>
+#include <string>
+#include <vector>

-#include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"

 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
-                 paddle::framework::DDim dims,
-                 T lower,
-                 T upper) {
-  srand(time(0));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  for (int i = 0; i < input.numel(); ++i) {
-    input_ptr[i] =
-        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
-        lower;
+void SetupTensor(paddle::framework::LoDTensor* input,
+                 paddle::framework::DDim dims, T lower, T upper) {
+  std::mt19937 rng(100);  // An arbitrarily chosen but fixed seed.
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int64_t i = 0; i < input->numel(); ++i) {
+    input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
  }
 }

 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
-                 paddle::framework::DDim dims,
-                 std::vector<T>& data) {
+void SetupTensor(paddle::framework::LoDTensor* input,
+                 paddle::framework::DDim dims, const std::vector<T>& data) {
  CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
 }

 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
-                    paddle::framework::LoD& lod,
-                    T lower,
-                    T upper) {
-  input.set_lod(lod);
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
+                    const paddle::framework::LoD& lod, T lower, T upper) {
+  input->set_lod(lod);
  int dim = lod[0][lod[0].size() - 1];
  SetupTensor<T>(input, {dim, 1}, lower, upper);
 }

 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
                    paddle::framework::DDim dims,
-                    paddle::framework::LoD lod,
-                    std::vector<T>& data) {
+                    const paddle::framework::LoD& lod,
+                    const std::vector<T>& data) {
  const size_t level = lod.size() - 1;
  CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
-  input.set_lod(lod);
+  input->set_lod(lod);
  SetupTensor<T>(input, dims, data);
 }

 template <typename T>
-void CheckError(paddle::framework::LoDTensor& output1,
-                paddle::framework::LoDTensor& output2) {
+void CheckError(const paddle::framework::LoDTensor& output1,
+                const paddle::framework::LoDTensor& output2) {
  // Check lod information
  EXPECT_EQ(output1.lod(), output2.lod());

@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1,
 template <typename Place>
 void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1,
-                   const bool is_combined = false) {
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const int repeat = 1, const bool is_combined = false) {
  // 1. Define place, executor, scope
  auto place = Place();
  auto executor = paddle::framework::Executor(place);
@@ -132,11 +131,9 @@ void TestInference(const std::string& dirname,
      //  `fluid.io.save_inference_model`.
      std::string prog_filename = "__model_combined__";
      std::string param_filename = "__params_combined__";
-      inference_program =
-          paddle::inference::Load(executor,
-                                  *scope,
-                                  dirname + "/" + prog_filename,
-                                  dirname + "/" + param_filename);
+      inference_program = paddle::inference::Load(
+          executor, *scope, dirname + "/" + prog_filename,
+          dirname + "/" + param_filename);
    } else {
      // Parameters are saved in separate files sited in the specified
      // `dirname`.

--- a/paddle/fluid/memory/.clang-format
+++ b/paddle/fluid/memory/.clang-format
---
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
 add_subdirectory(detail)

-cc_library(memory SRCS memory.cc DEPS place enforce)
+cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
 cc_library(memcpy SRCS memcpy.cc DEPS place)

-cc_library(paddle_memory
+cc_library(memory
        DEPS
-        memory
-        memcpy
-        meta_data
-        meta_cache
-        memory_block
-        buddy_allocator
-        system_allocator)
+        malloc
+        memcpy)

-cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)

 #if (WITH_GPU)
-#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place paddle_memory)
+#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
 #endif()
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
+cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
+
 if(${WITH_GPU})
  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
@@ -6,10 +8,4 @@ endif(${WITH_GPU})

 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)

-cc_library(meta_data SRCS meta_data.cc)
-
-cc_library(meta_cache SRCS meta_cache.cc)
-
-cc_library(memory_block SRCS memory_block.cc)
-
-cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) {

 void* BuddyAllocator::Alloc(size_t unaligned_size) {
  // adjust allocation alignment
-  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+  size_t size =
+      align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);

  // acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);
@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) {
    return;
  }

-  block->mark_as_free(cache_);
+  block->mark_as_free(&cache_);

  total_used_ -= block->total_size(cache_);
  total_free_ += block->total_size(cache_);
@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) {
                                   right_buddy));

      // merge its right buddy to the block
-      block->merge(cache_, right_buddy);
+      block->merge(&cache_, right_buddy);
    }
  }

@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) {
                                   left_buddy->total_size(cache_), left_buddy));

      // merge the block to its left buddy
-      left_buddy->merge(cache_, block);
+      left_buddy->merge(&cache_, block);
      block = left_buddy;
    }
  }
@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; }

 void* BuddyAllocator::SystemAlloc(size_t size) {
  size_t index = 0;
-  void* p = system_allocator_->Alloc(index, size);
+  void* p = system_allocator_->Alloc(&index, size);

  VLOG(10) << "Allocated " << p << " from system allocator.";

  if (p == nullptr) return nullptr;

-  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
                                     size, nullptr, nullptr);

  return static_cast<MemoryBlock*>(p)->data();
@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {

  // Allocate a new maximum sized block
  size_t index = 0;
-  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+  void* p = system_allocator_->Alloc(&index, max_chunk_size_);

  if (p == nullptr) return pool_.end();

  VLOG(10) << "Creating and inserting new block " << p
           << " from system allocator";

-  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                     max_chunk_size_, nullptr, nullptr);

  // gpu fallback allocation
@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,

  VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
           << ") into";
-  block->split(cache_, size);
+  block->split(&cache_, size);

  VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
           << ")";
-  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+  block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);

  // the rest of memory if exist
  if (block->has_right_buddy(cache_)) {

--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -14,18 +14,18 @@ limitations under the License. */

 #pragma once

-#include "paddle/fluid/memory/detail/meta_cache.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
+#include <mutex>  // NOLINT
+#include <set>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"

-#include <mutex>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
 namespace paddle {
 namespace memory {
 namespace detail {

--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_cache.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
 #include "paddle/fluid/platform/assert.h"

 namespace paddle {
 namespace memory {
 namespace detail {

-void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
                       void* left_buddy, void* right_buddy) {
-  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
-                             static_cast<MemoryBlock*>(left_buddy),
-                             static_cast<MemoryBlock*>(right_buddy)));
+  cache->save(
+      this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
+                              static_cast<MemoryBlock*>(left_buddy),
+                              static_cast<MemoryBlock*>(right_buddy)));
 }

-MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
  return cache.load(this).type;
 }

-size_t MemoryBlock::size(MetadataCache& cache) const {
+size_t MemoryBlock::size(const MetadataCache& cache) const {
  return cache.load(this).size;
 }

-size_t MemoryBlock::total_size(MetadataCache& cache) const {
+size_t MemoryBlock::index(const MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+size_t MemoryBlock::total_size(const MetadataCache& cache) const {
  return cache.load(this).total_size;
 }

-MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
  return cache.load(this).left_buddy;
 }

-MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
  return cache.load(this).right_buddy;
 }

-void MemoryBlock::split(MetadataCache& cache, size_t size) {
+void MemoryBlock::split(MetadataCache* cache, size_t size) {
  // make sure the split fits
-  PADDLE_ASSERT(total_size(cache) >= size);
+  PADDLE_ASSERT(total_size(*cache) >= size);

  // bail out if there is no room for another partition
-  if (total_size(cache) - size <= sizeof(Metadata)) {
+  if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
    return;
  }

  // find the position of the split
  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;

-  size_t remaining_size = total_size(cache) - size;
+  size_t remaining_size = total_size(*cache) - size;

  // Add the new block as a buddy
-  auto metadata = cache.load(this);
+  auto metadata = cache->load(this);

  // Write the metadata for the new block
  auto new_block_right_buddy = metadata.right_buddy;

-  cache.store(
-      static_cast<MemoryBlock*>(right_partition),
-      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
-               remaining_size, this, new_block_right_buddy));
+  cache->save(static_cast<MemoryBlock*>(right_partition),
+              MemoryBlock::Desc(FREE_CHUNK, index(*cache),
+                                remaining_size - sizeof(MemoryBlock::Desc),
+                                remaining_size, this, new_block_right_buddy));

  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
-  metadata.size = size - sizeof(Metadata);
+  metadata.size = size - sizeof(MemoryBlock::Desc);
  metadata.total_size = size;

-  cache.store(this, metadata);
+  cache->save(this, metadata);

  // Write metadata for the new block's right buddy
  if (new_block_right_buddy != nullptr) {
-    auto buddy_metadata = cache.load(new_block_right_buddy);
+    auto buddy_metadata = cache->load(new_block_right_buddy);

    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);

-    cache.store(new_block_right_buddy, buddy_metadata);
+    cache->save(new_block_right_buddy, buddy_metadata);
  }
 }

-void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
  // only free blocks can be merged
-  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
-  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);

-  auto metadata = cache.load(this);
+  auto metadata = cache->load(this);

  // link this->buddy's buddy
-  metadata.right_buddy = right_buddy->right_buddy(cache);
+  metadata.right_buddy = right_buddy->right_buddy(*cache);

  // link buddy's buddy -> this
  if (metadata.right_buddy != nullptr) {
-    auto buddy_metadata = cache.load(metadata.right_buddy);
+    auto buddy_metadata = cache->load(metadata.right_buddy);

    buddy_metadata.left_buddy = this;

-    cache.store(metadata.right_buddy, buddy_metadata);
+    cache->save(metadata.right_buddy, buddy_metadata);
  }

-  metadata.size += right_buddy->total_size(cache);
-  metadata.total_size += right_buddy->total_size(cache);
+  metadata.size += right_buddy->total_size(*cache);
+  metadata.total_size += right_buddy->total_size(*cache);

-  cache.store(this, metadata);
-  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+  cache->save(this, metadata);
+  cache->save(right_buddy,
+              MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
 }

-void MemoryBlock::mark_as_free(MetadataCache& cache) {
+void MemoryBlock::mark_as_free(MetadataCache* cache) {
  // check for double free or corruption
-  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
-  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
-
+  PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
  set_type(cache, FREE_CHUNK);
 }

-void MemoryBlock::set_type(MetadataCache& cache, Type t) {
-  auto metadata = cache.load(this);
-
+void MemoryBlock::set_type(MetadataCache* cache, Type t) {
+  auto metadata = cache->load(this);
  metadata.type = t;
-
-  cache.store(this, metadata);
-}
-
-bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
-  return left_buddy(cache) != nullptr;
-}
-
-bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
-  return right_buddy(cache) != nullptr;
-}
-
-size_t MemoryBlock::index(MetadataCache& cache) const {
-  return cache.load(this).index;
+  cache->save(this, metadata);
 }

 void* MemoryBlock::data() const {
-  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+  return const_cast<MemoryBlock::Desc*>(
+             reinterpret_cast<const MemoryBlock::Desc*>(this)) +
+         1;
 }

 MemoryBlock* MemoryBlock::metadata() const {
  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
-      reinterpret_cast<const Metadata*>(this) - 1));
+      reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
 }

 }  // namespace detail

--- a/paddle/fluid/memory/detail/memory_block.h
+++ b/paddle/fluid/memory/detail/memory_block.h
@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once

-#include <cstddef>
+#include <cstdint>
+#include <unordered_map>

 namespace paddle {
 namespace memory {
 namespace detail {

-// Forward Declarations
+// Forward declaration.
 class MetadataCache;

-/*! \brief A class used to interpret the contents of a memory block */
-class MemoryBlock {
- public:
+// MemoryBlock represents each allocated memory block, which contains
+// MemoryBlock::Desc and the payload.
+struct MemoryBlock {
  enum Type {
    FREE_CHUNK,    // memory is free and idle
    ARENA_CHUNK,   // memory is being occupied
@@ -33,57 +33,96 @@ class MemoryBlock {
    INVALID_CHUNK  // memory is invalid
  };

- public:
-  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+  // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
+  // If it is a CPU memory block, the MetadataCache writes the
+  // MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
+  // block, the MetadataCache writes the MemoryBlock::Desc to a
+  // std::unordered_map in CPU memory.
+  void init(MetadataCache* cache, Type t, size_t index, size_t size,
            void* left_buddy, void* right_buddy);

- public:
-  /*! \brief The type of the allocation */
-  Type type(MetadataCache& cache) const;
-
-  /*! \brief The size of the data region */
-  size_t size(MetadataCache& cache) const;
+  // All these accessors return fields in the MemoryBlock::Desc of the memory
+  // block.  They all need a MetadataCache instance as their first
+  // parameter because they read the MemoryBlock::Desc from the cache.
+  Type type(const MetadataCache& cache) const;
+  size_t size(const MetadataCache& cache) const;
+  size_t index(const MetadataCache& cache) const;
+  size_t total_size(const MetadataCache& cache) const;
+  bool has_left_buddy(const MetadataCache& cache) const;
+  bool has_right_buddy(const MetadataCache& cache) const;
+  MemoryBlock* left_buddy(const MetadataCache& cache) const;
+  MemoryBlock* right_buddy(const MetadataCache& cache) const;

-  /*! \brief An index to track the allocator */
-  size_t index(MetadataCache& cache) const;
+  // Split the allocation into left/right blocks.
+  void split(MetadataCache* cache, size_t size);

-  /*! \brief The total size of the block */
-  size_t total_size(MetadataCache& cache) const;
+  // Merge left and right blocks together.
+  void merge(MetadataCache* cache, MemoryBlock* right_buddy);

-  /*! \brief Check the left buddy of the block */
-  bool has_left_buddy(MetadataCache& cache) const;
+  // Mark the allocation as free.
+  void mark_as_free(MetadataCache* cache);

-  /*! \brief Check the right buddy of the block */
-  bool has_right_buddy(MetadataCache& cache) const;
-
-  /*! \brief Get the left buddy */
-  MemoryBlock* left_buddy(MetadataCache& cache) const;
-
-  /*! \brief Get the right buddy */
-  MemoryBlock* right_buddy(MetadataCache& cache) const;
-
- public:
-  /*! \brief Split the allocation into left/right blocks */
-  void split(MetadataCache& cache, size_t size);
+  // Change the type of the allocation.
+  void set_type(MetadataCache* cache, Type t);

-  /*! \brief Merge left and right blocks together */
-  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
-
-  /*! \brief Mark the allocation as free */
-  void mark_as_free(MetadataCache& cache);
-
-  /*! \brief Change the type of the allocation */
-  void set_type(MetadataCache& cache, Type t);
-
- public:
-  /*! \brief Get a pointer to the memory block's data */
  void* data() const;
-
-  /*! \brief Get a pointer to the memory block's metadata */
  MemoryBlock* metadata() const;

+  // MemoryBlock::Desc describes a MemoryBlock.
+  struct Desc {
+    Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+         MemoryBlock* r);
+    Desc();
+
+    // Updates guard_begin and guard_end with hashes of this Desc object.
+    void update_guards();
+
+    // Checks that guard_begin and guard_end are hashes of this Desc object.
+    bool check_guards() const;
+
+    // TODO(gangliao): compress this
+    size_t guard_begin = 0;
+    MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
+    size_t index = 0;
+    size_t size = 0;
+    size_t total_size = 0;
+    MemoryBlock* left_buddy = nullptr;
+    MemoryBlock* right_buddy = nullptr;
+    size_t guard_end = 0;
+  };
+};
+
+// A cache for accessing memory block meta-data that may be expensive
+// to access directly.  This class exists to unify the
+// MemoryBlock::Desc format between GPU and CPU allocations. It should
+// be removed when the CPU can access all GPU allocations directly via
+// UVM.
+class MetadataCache {
 public:
-  static size_t overhead();
+  explicit MetadataCache(bool uses_gpu);
+
+  // Disable copying and assignment.
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+  // Returns the MemoryBlock::Desc for a memory block.  When MetadataCache is
+  // used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
+  // of the memory block; when used to manage GPU memory, the
+  // MemoryBlock::Desc resides in CPU memory indexed by cache_.
+  MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
+
+  // Saves the MemoryBlock::Desc of a memory block into the cache.  For CPU
+  // memory block, writes the MemoryBlock::Desc to the beginning of the memory
+  // block; whereas for GPU memory, writes it to cache_.
+  void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
+
+  // For GPU memory block, erases its MemoryBlock::Desc from cache_.
+  void invalidate(MemoryBlock* memory_block);
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
+  MetadataMap cache_;
+  bool uses_gpu_;
 };

 }  // namespace detail

--- a/paddle/fluid/memory/detail/meta_data.cc
+++ b/paddle/fluid/memory/detail/meta_data.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/memory/detail/meta_data.h"
-
 #include <functional>

+#include "paddle/fluid/memory/detail/memory_block.h"
+
 namespace paddle {
 namespace memory {
 namespace detail {

-Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
-                   MemoryBlock* l, MemoryBlock* r)
+MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                        MemoryBlock* l, MemoryBlock* r)
    : type(t),
      index(i),
      size(s),
@@ -29,7 +29,7 @@ Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
      left_buddy(l),
      right_buddy(r) {}

-Metadata::Metadata()
+MemoryBlock::Desc::Desc()
    : type(MemoryBlock::INVALID_CHUNK),
      index(0),
      size(0),
@@ -37,32 +37,36 @@ Metadata::Metadata()
      left_buddy(nullptr),
      right_buddy(nullptr) {}

+namespace {
+
 template <class T>
-inline void hash_combine(std::size_t& seed, const T& v) {
+inline void hash_combine(std::size_t* seed, const T& v) {
  std::hash<T> hasher;
-  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+  (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
 }

-inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
  size_t seed = initial_seed;

-  hash_combine(seed, (size_t)metadata->type);
-  hash_combine(seed, metadata->index);
-  hash_combine(seed, metadata->size);
-  hash_combine(seed, metadata->total_size);
-  hash_combine(seed, metadata->left_buddy);
-  hash_combine(seed, metadata->right_buddy);
+  hash_combine(&seed, static_cast<size_t>(metadata.type));
+  hash_combine(&seed, metadata.index);
+  hash_combine(&seed, metadata.size);
+  hash_combine(&seed, metadata.total_size);
+  hash_combine(&seed, metadata.left_buddy);
+  hash_combine(&seed, metadata.right_buddy);

  return seed;
 }

-void Metadata::update_guards() {
-  guard_begin = hash(this, 1);
-  guard_end = hash(this, 2);
+}  // namespace
+
+void MemoryBlock::Desc::update_guards() {
+  guard_begin = hash(*this, 1);
+  guard_end = hash(*this, 2);
 }

-bool Metadata::check_guards() const {
-  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+bool MemoryBlock::Desc::check_guards() const {
+  return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
 }

 }  // namespace detail

--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/memory/detail/meta_cache.h"
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/assert.h"
@@ -23,29 +22,28 @@ namespace detail {

 MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}

-Metadata MetadataCache::load(const MemoryBlock* block) {
+MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
  if (uses_gpu_) {
-    auto existing_metadata = cache_.find(block);
-    PADDLE_ASSERT(existing_metadata->second.check_guards());
-    return existing_metadata->second;
+    auto existing_desc = cache_.find(block);
+    PADDLE_ASSERT(existing_desc->second.check_guards());
+    return existing_desc->second;
  } else {
-    auto* meta = reinterpret_cast<const Metadata*>(block);
-    VLOG(10) << "Load MetaData type=" << meta->type;
-    PADDLE_ASSERT(meta->check_guards());
-    return *reinterpret_cast<const Metadata*>(block);
+    auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
+    VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
+    PADDLE_ASSERT(desc->check_guards());
+    return *reinterpret_cast<const MemoryBlock::Desc*>(block);
  }
 }

-void MetadataCache::store(MemoryBlock* block,
-                          const Metadata& original_metadata) {
-  auto metadata = original_metadata;
-
-  metadata.update_guards();
+void MetadataCache::save(MemoryBlock* block,
+                         const MemoryBlock::Desc& original_desc) {
+  auto desc = original_desc;
+  desc.update_guards();

  if (uses_gpu_) {
-    cache_[block] = metadata;
+    cache_[block] = desc;
  } else {
-    *reinterpret_cast<Metadata*>(block) = metadata;
+    *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
  }
 }


--- a/paddle/fluid/memory/detail/meta_cache.h
+++ b/paddle/fluid/memory/detail/meta_cache.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
-
-#include <unordered_map>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-/**
- *  \brief A cache for accessing memory block meta-data that may be expensive
- *         to access directly.
- *
- *  \note  This class exists to unify the metadata format between GPU and CPU
- *         allocations. It should be removed when the CPU can access all GPU
- *         allocations directly via UVM.
- */
-class MetadataCache {
- public:
-  explicit MetadataCache(bool uses_gpu);
-
- public:
-  /*! \brief Load the associated metadata for the specified memory block. */
-  Metadata load(const MemoryBlock* memory_block);
-
-  /*! \brief Store the associated metadata for the specified memory block. */
-  void store(MemoryBlock* memory_block, const Metadata& meta_data);
-
-  /*! \brief Indicate that the specified metadata will no longer be used. */
-  void invalidate(MemoryBlock* memory_block);
-
- public:
-  MetadataCache(const MetadataCache&) = delete;
-  MetadataCache& operator=(const MetadataCache&) = delete;
-
- private:
-  bool uses_gpu_;
-
- private:
-  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
-
- private:
-  MetadataMap cache_;
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/detail/meta_data.h
+++ b/paddle/fluid/memory/detail/meta_data.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-
-#include <stddef.h>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-class Metadata {
- public:
-  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
-           MemoryBlock* r);
-  Metadata();
-
- public:
-  /*! \brief Update the guards when metadata is changed */
-  void update_guards();
-
-  /*! \brief Check consistency to previous modification */
-  bool check_guards() const;
-
- public:
-  // TODO(gangliao): compress this
-  // clang-format off
-  size_t            guard_begin = 0;
-  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
-  size_t            index       = 0;
-  size_t            size        = 0;
-  size_t            total_size  = 0;
-  MemoryBlock*      left_buddy  = nullptr;
-  MemoryBlock*      right_buddy = nullptr;
-  size_t            guard_end   = 0;
-  // clang-format on
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/assert.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"

 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 #include <algorithm>   // for std::max

 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"

 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
@@ -35,13 +35,13 @@ namespace paddle {
 namespace memory {
 namespace detail {

-void* CPUAllocator::Alloc(size_t& index, size_t size) {
+void* CPUAllocator::Alloc(size_t* index, size_t size) {
  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
  // malloc might not return nullptr if size is zero, but the returned
  // pointer shall not be dereferenced -- so we make it nullptr.
  if (size <= 0) return nullptr;

-  index = 0;  // unlock memory
+  *index = 0;  // unlock memory

  void* p;

@@ -56,7 +56,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) {

  if (p != nullptr) {
    if (FLAGS_use_pinned_memory) {
-      index = 1;
+      *index = 1;
      mlock(p, size);  // lock memory
    }
  }
@@ -75,7 +75,7 @@ bool CPUAllocator::UseGpu() const { return false; }

 #ifdef PADDLE_WITH_CUDA

-void* GPUAllocator::Alloc(size_t& index, size_t size) {
+void* GPUAllocator::Alloc(size_t* index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
  if (size <= 0) return nullptr;
@@ -93,7 +93,7 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) {
  }

  if (result == cudaSuccess) {
-    index = 0;
+    *index = 0;
    gpu_alloc_size_ += size;
    return p;
  } else {
@@ -133,7 +133,7 @@ bool GPUAllocator::UseGpu() const { return true; }

 // PINNED memory allows direct DMA transfers by the GPU to and from system
 // memory. It’s locked to a physical address.
-void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;

  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
@@ -154,7 +154,7 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
  cudaError_t result = cudaMallocHost(&p, size);

  if (result == cudaSuccess) {
-    index = 1;  // PINNED memory
+    *index = 1;  // PINNED memory
    cuda_pinnd_alloc_size_ += size;
    return p;
  } else {

--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -29,14 +29,14 @@ namespace detail {
 class SystemAllocator {
 public:
  virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void* Alloc(size_t* index, size_t size) = 0;
  virtual void Free(void* p, size_t size, size_t index) = 0;
  virtual bool UseGpu() const = 0;
 };

 class CPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t& index, size_t size);
+  virtual void* Alloc(size_t* index, size_t size);
  virtual void Free(void* p, size_t size, size_t index);
  virtual bool UseGpu() const;
 };
@@ -46,7 +46,7 @@ class GPUAllocator : public SystemAllocator {
 public:
  explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}

-  virtual void* Alloc(size_t& index, size_t size);
+  virtual void* Alloc(size_t* index, size_t size);
  virtual void Free(void* p, size_t size, size_t index);
  virtual bool UseGpu() const;

@@ -58,7 +58,7 @@ class GPUAllocator : public SystemAllocator {

 class CUDAPinnedAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t& index, size_t size);
+  virtual void* Alloc(size_t* index, size_t size);
  virtual void Free(void* p, size_t size, size_t index);
  virtual bool UseGpu() const;


--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -22,11 +22,11 @@ limitations under the License. */

 DECLARE_bool(use_pinned_memory);

-void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
+void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
  bool freed = false;
  {
    size_t index;
-    void* p = a.Alloc(index, size);
+    void* p = a->Alloc(&index, size);
    if (size > 0) {
      EXPECT_NE(p, nullptr);
    } else {
@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
    int* i = static_cast<int*>(p);
    std::shared_ptr<int> ptr(i, [&](void* p) {
      freed = true;
-      a.Free(p, size, index);
+      a->Free(p, size, index);
    });
  }
  EXPECT_TRUE(freed);
@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
 TEST(CPUAllocator, NoLockMem) {
  FLAGS_use_pinned_memory = false;
  paddle::memory::detail::CPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }

 TEST(CPUAllocator, LockMem) {
  FLAGS_use_pinned_memory = true;
  paddle::memory::detail::CPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }

 #ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
  paddle::memory::detail::GPUAllocator a(0);
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 #endif
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/memory/malloc.h"

 #include "glog/logging.h"

@@ -95,7 +95,7 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
    size_t avail, total;
-    platform::GpuMemoryUsage(avail, total);
+    platform::GpuMemoryUsage(&avail, &total);
    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
                 << place.device << ", available " << avail << " bytes";
    LOG(WARNING) << "total " << total;

--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+/**
+ * \brief   Allocate memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  size   Allocation size.
+ *
+ * \return  Allocated memory block address.
+ *
+ * \note    If nullptr is returned, memory allocation failed because of
+ *          insufficient memory in the current system. When Alloc is
+ *          invoked, you must check whether the returned memory
+ *          address is valid.
+ */
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  ptr    Memory block address to free.
+ *
+ */
+template <typename Place>
+void Free(Place place, void* ptr);
+
+/**
+ * \brief   Total size of used memory in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ *
+ */
+template <typename Place>
+size_t Used(Place place);
+
+struct Usage : public boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
+ public:
+  explicit PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
+/**
+ * \brief   Free a memory block in one place whose type is not POD.
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
@@ -12,23 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
+#include "paddle/fluid/memory/malloc.h"
+
+#include <unordered_map>

+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"

-#include <gtest/gtest.h>
-#include <unordered_map>
-
 inline bool is_aligned(void const *p) {
  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
 }

 size_t align(size_t size, paddle::platform::CPUPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
  size_t alignment = paddle::platform::CpuMinChunkSize();
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
@@ -86,7 +85,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
 #ifdef PADDLE_WITH_CUDA

 size_t align(size_t size, paddle::platform::CUDAPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
  size_t alignment = paddle::platform::GpuMinChunkSize();
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
@@ -142,7 +141,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
 }

 size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
  size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);

--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -14,91 +14,5 @@ limitations under the License. */

 #pragma once

-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-
-/**
- * \brief   Allocate memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  size   Allocation size.
- *
- * \return  Allocated memory block address.
- *
- * \note    If return nullptr, it indicates memory allocation failed
- *          because insufficient memory in current system. When Alloc
- *          function is invoked, you must check the returned memory
- *          address is valid or not.
- */
-template <typename Place>
-void* Alloc(Place place, size_t size);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  ptr    Memory block address to free.
- *
- */
-template <typename Place>
-void Free(Place place, void* ptr);
-
-/**
- * \brief   Total size of used memory in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- *
- */
-template <typename Place>
-size_t Used(Place place);
-
-struct Usage : public boost::static_visitor<size_t> {
-  size_t operator()(const platform::CPUPlace& cpu) const;
-  size_t operator()(const platform::CUDAPlace& gpu) const;
-  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
-};
-
-size_t memory_usage(const platform::Place& p);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PODDeleter {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-
- public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
-
- private:
-  Place place_;
-};
-
-/**
- * \brief   Free memory block in one place does not meet POD
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PlainDeleter {
- public:
-  explicit PlainDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
-
- private:
-  Place place_;
-};
-
-}  // namespace memory
-}  // namespace paddle
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
--- a/paddle/fluid/memory/pinned_memory_test.cu
+++ b/paddle/fluid/memory/pinned_memory_test.cu
@@ -15,7 +15,6 @@ limitations under the License. */
 #include <unordered_map>

 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"


--- a/paddle/fluid/operators/.clang-format
+++ b/paddle/fluid/operators/.clang-format
---
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
 list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
-set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
-file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
+set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
+file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
    # op_library is a function to create op library. The interface is same as
    # cc_library. But it handle split GPU/CPU code and link some common library
@@ -263,7 +263,7 @@ cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
-cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
+cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)

--- a/paddle/fluid/operators/activation_mkldnn_op.cc
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -13,8 +13,8 @@
   limitations under the License. */

 #include "mkldnn.hpp"
-#include "mkldnn_activation_op.h"
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/mkldnn_activation_op.h"

 namespace paddle {
 namespace operators {
@@ -40,18 +40,24 @@ void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
  const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());

  // get memory dim
-  PADDLE_ENFORCE(src->dims().size() == 4,
-                 "Input dim must be with 4, i.e. NCHW");
+  PADDLE_ENFORCE(src->dims().size() == 2 || src->dims().size() == 4,
+                 "Input dim must be with 2 or 4");
  std::vector<int> src_tz = framework::vectorize2int(src->dims());

  // create memory description
-  // TODO(kbinias-intel): support more formats
-  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                         mkldnn::memory::format::nchw);
+  auto data_md = src_tz.size() == 2
+                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                               mkldnn::memory::format::nc)
+                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                               mkldnn::memory::format::nchw);

  // create memory primitives
-  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
-  auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
+  auto src_memory =
+      mkldnn::memory({data_md, mkldnn_engine},
+                     static_cast<void *>(const_cast<float *>(src_data)));
+  auto dst_memory =
+      mkldnn::memory({data_md, mkldnn_engine},
+                     static_cast<void *>(const_cast<float *>(dst_data)));

  auto forward_desc = mkldnn::eltwise_forward::desc(
      mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
@@ -91,15 +97,21 @@ void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
  std::vector<int> src_tz = framework::vectorize2int(x->dims());

  // create memory description
-  auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                         mkldnn::memory::format::nchw);
+  auto data_md = src_tz.size() == 2
+                     ? platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                               mkldnn::memory::format::nc)
+                     : platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
+                                               mkldnn::memory::format::nchw);

  // create memory primitives
-  auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
+  auto src_memory = mkldnn::memory(
+      {data_md, mkldnn_engine}, static_cast<void *>(const_cast<float *>(src)));
  auto diff_src_memory =
-      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
+      mkldnn::memory({data_md, mkldnn_engine},
+                     static_cast<void *>(const_cast<float *>(diff_src)));
  auto diff_dst_memory =
-      mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
+      mkldnn::memory({data_md, mkldnn_engine},
+                     static_cast<void *>(const_cast<float *>(diff_dst)));

  auto backward_desc =
      mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);

--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor core is supported starting from Volta GPUs and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently tensor core is only enabled using this algo
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    } else {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    }
+#endif
+
    // get workspace size able to allocate
    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
        cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // It is possible for float16 on Volta GPU to allocate more memory than
+    // the limit because the algo is overridden to use tensor core.
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
    // Allocate on GPU memory
    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);

--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ b/paddle/fluid/operators/detail/CMakeLists.txt
@@ -5,5 +5,5 @@ if(WITH_DISTRIBUTE)
  set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
      cares zlib protobuf sendrecvop_grpc)
-  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
 endif()
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -138,7 +138,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep,
    auto* var = p_scope->FindVar(in_var_name_val);

    ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req);
+    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);

    // var handle
    VarHandle var_h;

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -138,39 +138,48 @@ class RequestPrefetch final : public RequestBase {
                           framework::Scope* scope,
                           const platform::DeviceContext* dev_ctx,
                           framework::Executor* executor,
-                           framework::ProgramDesc* program, int blkid)
+                           framework::ProgramDesc* program,
+                           framework::ExecutorPrepareContext* prefetch_ctx)
      : RequestBase(service, cq, dev_ctx),
        responder_(&ctx_),
        scope_(scope),
        executor_(executor),
        program_(program),
-        blkid_(blkid) {
+        prefetch_ctx_(prefetch_ctx) {
+    request_.reset(new VariableResponse(scope, dev_ctx_));
    int method_id = static_cast<int>(detail::GrpcMethod::kPrefetchVariable);
-    service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_,
-                                cq_, this);
+    service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_,
+                                cq_, cq_, this);
  }

  virtual ~RequestPrefetch() {}

-  virtual std::string GetReqName() { return request_.varname(); }
+  virtual std::string GetReqName() { return request_->Varname(); }

  virtual void Process() {
    // prefetch process...
    ::grpc::ByteBuffer reply;
-    // TODO(Yancey1989): execute the Block which containers prefetch ops

-    VLOG(3) << "RequestPrefetch Process in";
+    std::string var_name = request_->OutVarname();
+    auto var_desc = program_->Block(0).FindVar(var_name);
+    framework::Scope* local_scope = &scope_->NewScope();
+    auto* var = local_scope->FindVar(var_name);
+    InitializeVariable(var, var_desc->GetType());
+    executor_->RunPreparedContext(prefetch_ctx_, scope_, false, false);
+
+    SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply);

    responder_.Finish(reply, ::grpc::Status::OK, this);
    status_ = FINISH;
  }

 protected:
-  sendrecv::VariableMessage request_;
+  std::shared_ptr<VariableResponse> request_;
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
  framework::Scope* scope_;
  framework::Executor* executor_;
  framework::ProgramDesc* program_;
+  framework::ExecutorPrepareContext* prefetch_ctx_;
  int blkid_;
 };

@@ -268,7 +277,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
  }
  RequestPrefetch* prefetch =
      new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_,
-                          executor_, program_, prefetch_blk_id_);
+                          executor_, program_, prefetch_ctx_);

  VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status();
 }

--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
@@ -63,6 +63,10 @@ class AsyncGRPCServer final {

  void SetExecutor(framework::Executor *executor) { executor_ = executor; }

+  void SetPrefetchPreparedCtx(framework::ExecutorPrepareContext *prepared) {
+    prefetch_ctx_ = prepared;
+  }
+
  int GetSelectedPort() { return selected_port_; }

  const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
@@ -111,6 +115,7 @@ class AsyncGRPCServer final {
  std::unique_ptr<std::thread> t_prefetch_;

  int prefetch_blk_id_;
+  framework::ExecutorPrepareContext *prefetch_ctx_;
  framework::ProgramDesc *program_;
  framework::Executor *executor_;
  int selected_port_;

--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
@@ -20,43 +20,121 @@ limitations under the License. */
 #include "paddle/fluid/operators/detail/grpc_client.h"
 #include "paddle/fluid/operators/detail/grpc_server.h"

+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace detail = paddle::operators::detail;

+USE_OP(lookup_table);
+
 std::unique_ptr<detail::AsyncGRPCServer> rpc_service_;

+framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
+  auto root_block = program->MutableBlock(0);
+  auto* block = program->AppendBlock(*root_block);
+
+  framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
+  framework::VariableNameMap output({{"Output", {"out"}}});
+  auto op = block->AppendOp();
+  op->SetType("lookup_table");
+  op->SetInput("W", {"w"});
+  op->SetInput("Ids", {"ids"});
+  op->SetOutput("Out", {"out"});
+
+  auto& out = *root_block->Var("out");
+  out.SetType(framework::proto::VarType::SELECTED_ROWS);
+  out.SetShape({10, 10});
+
+  return block;
+}
+
+void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
+  auto w_var = scope->Var("w");
+  w_var->GetMutable<framework::SelectedRows>();
+
+  auto out_var = scope->Var("out");
+  out_var->GetMutable<framework::SelectedRows>();
+
+  auto ids_var = scope->Var("ids");
+  ids_var->GetMutable<framework::SelectedRows>();
+}
+
+void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
+  auto rows = ids_var->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
+  ids_var->mutable_value()->Resize({rows_numel, 1});
+  ids_var->mutable_value()->mutable_data<float>(*place);
+}
+
+void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
+  auto rows = w->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
+  auto w_value = w->mutable_value();
+  w_value->Resize({rows_numel, 10});
+
+  auto ptr = w_value->mutable_data<float>(*place);
+
+  for (int64_t i = 0; i < w_value->numel(); ++i) {
+    ptr[i] = static_cast<float>(i / 10);
+  }
+}
+
 void StartServer(const std::string& endpoint) {
  rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+  auto* block = AppendPrefetchBlcok(&program);
+  auto prepared = exe.Prepare(program, block->ID());
+  InitTensorsOnServer(&scope, &place, 10);
+
+  rpc_service_->SetProgram(&program);
+  rpc_service_->SetPrefetchPreparedCtx(prepared.get());
+  rpc_service_->SetDevCtx(&ctx);
+  rpc_service_->SetScope(&scope);
+  rpc_service_->SetExecutor(&exe);
+
  rpc_service_->RunSyncUpdate();
 }

 TEST(PREFETCH, CPU) {
  // start up a server instance backend
-  // TODO(Yancey1989): Need to start a server with optimize blocks and
-  // prefetch blocks.
  std::thread server_thread(StartServer, "127.0.0.1:8889");
+  sleep(2);
  framework::Scope scope;
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
  // create var on local scope
-  std::string in_var_name("in");
+  int64_t rows_numel = 5;
+  InitTensorsOnClient(&scope, &place, rows_numel);
+  std::string in_var_name("ids");
  std::string out_var_name("out");
-  auto* in_var = scope.Var(in_var_name);
-  auto* in_tensor = in_var->GetMutable<framework::LoDTensor>();
-  in_tensor->Resize({10, 10});
-  VLOG(3) << "before mutable_data";
-  in_tensor->mutable_data<int>(place);

-  scope.Var(out_var_name);
-
-  VLOG(3) << "before fetch";
  detail::RPCClient client;
  client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name,
                               out_var_name);
  client.Wait();

+  auto var = scope.Var(out_var_name);
+  auto value = var->GetMutable<framework::SelectedRows>()->value();
+  auto ptr = value.mutable_data<float>(place);
+
  rpc_service_->ShutDown();
  server_thread.join();
  rpc_service_.reset(nullptr);
+
+  for (int64_t i = 0; i < rows_numel; ++i) {
+    EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+  }
 }
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ b/paddle/fluid/operators/detail/send_recv.proto
@@ -21,7 +21,7 @@ service SendRecvService {
  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
  // Argument VariableMessage for GetVariable should only contain varname.
  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  // Prefetch variable by Ids
+  // Prefetch variable by the given variable name and Ids
  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
 }

@@ -67,6 +67,8 @@ message VariableMessage {
  bytes serialized = 8;
  // selected_rows data
  bytes rows = 9;
+  // Look up table block execution output variable name.
+  string out_varname = 10;
 }

 message VoidMessage {}
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -30,11 +30,9 @@ namespace detail {

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg) {
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
  using VarMsg = sendrecv::VariableMessage;
-  sendrecv::VariableMessage request;
-  std::string header;
-  request.AppendToString(&header);
  // When using GPU, need to free the copied CPU buffer
  // when the ByteBuffer destroies
  // TODO(typhoonzero): add unref here, if we have dependent
@@ -52,6 +50,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
    e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
  }

+  if (!out_name.empty()) {
+    e.WriteString(VarMsg::kOutVarnameFieldNumber, out_name);
+  }
  switch (framework::ToVarType(var->Type())) {
    case framework::proto::VarType_Type_LOD_TENSOR: {
      auto tensor = var->Get<framework::LoDTensor>();

--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
@@ -46,7 +46,8 @@ typedef void (*DestroyCallback)(void*);

 void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg);
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = std::string());

 void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,

--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
@@ -416,6 +416,20 @@ int VariableResponse::Parse(Source* source) {
        }
        break;
      }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_out_varname(temp);
+        break;
+      }

      default: {
        // Unknown tag, return unknown error.

--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
@@ -55,6 +55,7 @@ class VariableResponse {
  int Parse(const ::grpc::ByteBuffer& byte_buffer);

  inline std::string Varname() { return meta_.varname(); }
+  inline std::string OutVarname() { return meta_.out_varname(); }

  // should call parse first.
  framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); }

--- a/paddle/fluid/operators/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
@@ -27,8 +27,8 @@ template <typename T>
 class MKLDNNMD {
 public:
  explicit MKLDNNMD(const T* in, const T* w, bool bias)
-      : in{paddle::framework::vectorize2int(in->dims())},
-        w{paddle::framework::vectorize2int(w->dims())} {
+      : in(paddle::framework::vectorize2int(in->dims())),
+        w(paddle::framework::vectorize2int(w->dims())) {
    with_bias_ = bias;
  }

@@ -78,7 +78,7 @@ class MKLDNNMD {
 class MKLDNNMemory {
 public:
  MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e)
-      : md_{t}, engine_{e} {}
+      : md_(t), engine_(e) {}
  virtual ~MKLDNNMemory() = default;

  template <typename Output>

--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

@@ -35,7 +37,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
      if (lod_t->lod().size() > 0) {
        auto y_lod = lod_t->lod();
        auto last_level = y_lod[y_lod.size() - 1];
-        PADDLE_ENFORCE_EQ(last_level.back(), in->dims()[0],
+        PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
                          "Last value of `Y`'s last level LoD should be equal "
                          "to the first dimension of `X`");
        out->set_lod(y_lod);

--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -18,22 +18,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-static inline framework::OpKernelType ExpectedKernelType(
-    const framework::ExecutionContext& ctx) {
-  auto* table_var = ctx.InputVar("W");
-  if (table_var->IsType<LoDTensor>()) {
-    return framework::OpKernelType(
-        framework::ToDataType(table_var->Get<LoDTensor>().type()),
-        ctx.device_context());
-  } else if (table_var->IsType<SelectedRows>()) {
-    return framework::OpKernelType(
-        framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
-        ctx.device_context());
-  } else {
-    PADDLE_THROW("W should be LoDTensor or SelectedRows");
-  }
-}
-
 class LookupTableOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    return ExpectedKernelType(ctx);
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
  }
 };

@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
-    return ExpectedKernelType(ctx);
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
  }
 };


--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
 using DDim = framework::DDim;

-static constexpr int64_t kNoPadding = -1;
-
-inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
-  auto it = std::find(rows.begin(), rows.end(), value);
-  PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
-  return static_cast<size_t>(std::distance(rows.begin(), it));
-}
+constexpr int64_t kNoPadding = -1;

 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> {
      auto *table_t = context.Input<SelectedRows>("W");
      table_dim = table_t->value().dims();
    } else {
-      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+      PADDLE_THROW(
+          "The parameter W of a LookupTable "
+          "must be either LoDTensor or SelectedRows");
    }

    int64_t *ids;
@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
          memset(output + i * row_width, 0, row_width * sizeof(T));
        } else {
          PADDLE_ENFORCE_GE(ids[i], 0);
-          auto id_index = getIndex(table_t.rows(), ids[i]);
+          auto id_index = table_t.index(ids[i]);
          memcpy(output + i * row_width, table + id_index * row_width,
                 row_width * sizeof(T));
        }
@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
      auto *table_t = context.Input<SelectedRows>("W");
      table_dim = table_t->value().dims();
    } else {
-      PADDLE_THROW("table only support LoDTensor and SelectedRows");
+      PADDLE_THROW(
+          "The parameter W of a LookupTable "
+          "must be either LoDTensor or SelectedRows");
    }

    bool is_sparse = context.Attr<bool>("is_sparse");

--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>(
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas Hgemm requires GPU compute capability >= 53");
-  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
-      h_A, lda, &h_beta, h_C, N));
+                    "cublas fp16 gemm requires GPU compute capability >= 53");
+
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+  if (context.GetComputeCapability() >= 70) {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
+                                                        CUBLAS_TENSOR_OP_MATH));
+    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+  } else {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
+                                                        CUBLAS_DEFAULT_MATH));
+  }
+#endif
+
+  // cublasHgemm does true FP16 computation which is slow for non-Volta
+  // GPUs. So use cublasGemmEx instead, which does pseudo FP16 computation:
+  // input/output in fp16, computation in fp32, which can also be accelerated
+  // using tensor cores in volta GPUs.
+  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
+      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
+      CUDA_R_32F, algo));
 }

 template <>

--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -14,6 +14,8 @@ limitations under the License. */

 #define EIGEN_USE_GPU

+#include <vector>
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>;
 template class SoftmaxGradCUDNNFunctor<float>;
 template class SoftmaxGradCUDNNFunctor<double>;

+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;

--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 struct ValueClip {
  HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = -64.;
+    const T kThreshold = static_cast<T>(-64.);
    return x < kThreshold ? kThreshold : x;
  }
 };

--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        platform::CPUPlace());
+        ctx.device_context());
  }
 };

@@ -171,6 +171,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
                  paddle::framework::EmptyGradOpMaker);

-REGISTER_OP_CPU_KERNEL(
-    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
-    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float>,
+                       ops::PriorBoxOpKernel<double>);
--- a/paddle/fluid/operators/prior_box_op.cu
+++ b/paddle/fluid/operators/prior_box_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Clamps `in` to the closed interval [0, 1].
+// The bounds are cast to T so that the comparison is carried out in T instead
+// of being promoted to double by floating-point literals and narrowed back.
+template <typename T>
+__device__ inline T clip(T in) {
+  const T lower = static_cast<T>(0);
+  const T upper = static_cast<T>(1);
+  return min(max(in, lower), upper);
+}
+
+// Generates prior boxes on the device. Each thread walks the flattened
+// (row, column, prior) index space with a step of blockDim.x * gridDim.x and
+// writes four values per box into `out`: xmin, ymin, xmax, ymax, each divided
+// by the image width or height.
+//
+// For every entry of `min_sizes` there are `as_num` boxes shaped by
+// `aspect_ratios`, followed by one square box of half-size
+// sqrt(min_size * max_size) / 2 when `max_sizes` is non-null.
+template <typename T>
+__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
+                            const int width, const int im_height,
+                            const int im_width, const int as_num,
+                            const T offset, const T step_width,
+                            const T step_height, const T* min_sizes,
+                            const T* max_sizes, const int min_num,
+                            bool is_clip) {
+  const bool has_max = max_sizes != nullptr;
+  // Number of boxes emitted for each min size.
+  const int per_min = has_max ? as_num + 1 : as_num;
+  const int num_priors = per_min * min_num;
+  const int box_num = height * width * num_priors;
+  const unsigned int stride = blockDim.x * gridDim.x;
+
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  while (i < box_num) {
+    // Decompose the flat index into (row, column, prior slot).
+    const int h = i / (num_priors * width);
+    const int w = (i / num_priors) % width;
+    const int p = i % num_priors;
+    const int m = p / per_min;  // which min size
+    const int s = p % per_min;  // slot inside that min size's group
+
+    const T cx = (w + offset) * step_width;
+    const T cy = (h + offset) * step_height;
+    const T min_size = min_sizes[m];
+
+    T bw, bh;
+    if (has_max && s == as_num) {
+      // Trailing slot of the group: the square box built from min and max.
+      const T max_size = max_sizes[m];
+      bw = sqrt(min_size * max_size) / 2.;
+      bh = bw;
+    } else {
+      const T ar = aspect_ratios[s];
+      bw = min_size * sqrt(ar) / 2.;
+      bh = min_size / sqrt(ar) / 2.;
+    }
+
+    T xmin = (cx - bw) / im_width;
+    T ymin = (cy - bh) / im_height;
+    T xmax = (cx + bw) / im_width;
+    T ymax = (cy + bh) / im_height;
+    if (is_clip) {
+      xmin = clip<T>(xmin);
+      ymin = clip<T>(ymin);
+      xmax = clip<T>(xmax);
+      ymax = clip<T>(ymax);
+    }
+
+    T* box = out + i * 4;
+    box[0] = xmin;
+    box[1] = ymin;
+    box[2] = xmax;
+    box[3] = ymax;
+
+    i += stride;
+  }
+}
+
+// Fills out[0..num) by repeating the `vnum` entries of `var` cyclically:
+// out[i] = var[i % vnum]. Each thread advances by blockDim.x * gridDim.x.
+template <typename T>
+__global__ void SetVariance(T* out, const T* var, const int vnum,
+                            const int num) {
+  const unsigned int stride = blockDim.x * gridDim.x;
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  while (idx < num) {
+    out[idx] = var[idx % vnum];
+    idx += stride;
+  }
+}
+
+// CUDA kernel for the prior_box operator. It fills the "Boxes" output with
+// the prior box corners produced by GenPriorBox and the "Variances" output
+// with the "variances" attribute repeated for every box.
+template <typename T>
+class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto is_clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+
+    // The attributes are read as float, but the device kernels consume them
+    // as T. Convert on the host so the uploaded tensors really hold T; a
+    // tensor built from std::vector<float> holds float, and reading it through
+    // data<T>() is a type mismatch when T is double.
+    std::vector<T> aspect_ratios_t(aspect_ratios.begin(), aspect_ratios.end());
+    std::vector<T> min_sizes_t(min_sizes.begin(), min_sizes.end());
+    std::vector<T> max_sizes_t(max_sizes.begin(), max_sizes.end());
+    std::vector<T> variances_t(variances.begin(), variances.end());
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto im_width = image->dims()[3];
+    auto im_height = image->dims()[2];
+
+    auto width = input->dims()[3];
+    auto height = input->dims()[2];
+
+    // A zero step in either direction means "derive the step from the ratio
+    // of image size to feature-map size".
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(im_width) / width;
+      step_height = static_cast<T>(im_height) / height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+    int min_num = static_cast<int>(min_sizes.size());
+    int box_num = width * height * num_priors;
+
+    int block = 512;
+    int grid = (box_num + block - 1) / block;
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor ratio_tensor;
+    framework::TensorFromVector(aspect_ratios_t, ctx.device_context(),
+                                &ratio_tensor);
+
+    framework::Tensor min_tensor;
+    framework::TensorFromVector(min_sizes_t, ctx.device_context(),
+                                &min_tensor);
+
+    // max_sizes is optional; GenPriorBox treats a null pointer as "absent".
+    T* max_data = nullptr;
+    framework::Tensor max_tensor;
+    if (max_sizes_t.size() > 0) {
+      framework::TensorFromVector(max_sizes_t, ctx.device_context(),
+                                  &max_tensor);
+      max_data = max_tensor.data<T>();
+    }
+
+    GenPriorBox<T><<<grid, block, 0, stream>>>(
+        boxes->data<T>(), ratio_tensor.data<T>(), height, width, im_height,
+        im_width, static_cast<int>(aspect_ratios_t.size()), offset, step_width,
+        step_height, min_tensor.data<T>(), max_data, min_num, is_clip);
+
+    framework::Tensor var_tensor;
+    framework::TensorFromVector(variances_t, ctx.device_context(),
+                                &var_tensor);
+    // One variance value is written per box coordinate (4 per box).
+    grid = (box_num * 4 + block - 1) / block;
+    SetVariance<T><<<grid, block, 0, stream>>>(
+        vars->data<T>(), var_tensor.data<T>(),
+        static_cast<int>(variances_t.size()), box_num * 4);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
+                        ops::PriorBoxOpCUDAKernel<double>);
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -51,7 +51,7 @@ struct ClipFunctor {
  }
 };

-template <typename Place, typename T>
+template <typename T>
 class PriorBoxOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
        int idx = 0;
        for (size_t s = 0; s < min_sizes.size(); ++s) {
          auto min_size = min_sizes[s];
-          // first prior: aspect_ratio = 1, size = min_size
-          box_width = box_height = min_size / 2.;
-          // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-          // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-          // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-          // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-
-          idx++;
-          if (max_sizes.size() > 0) {
-            auto max_size = max_sizes[s];
-            // second prior: aspect_ratio = 1,
-            // size = sqrt(min_size * max_size)
-            box_width = box_height = sqrt(min_size * max_size) / 2.;
-            // xmin
+          // priors with different aspect ratios
+          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+            float ar = aspect_ratios[r];
+            box_width = min_size * sqrt(ar) / 2.;
+            box_height = min_size / sqrt(ar) / 2.;
            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            // ymin
            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            // xmax
            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            // ymax
            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
            idx++;
          }
-
-          // rest of priors
-          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-            float ar = aspect_ratios[r];
-            if (fabs(ar - 1.) < 1e-6) {
-              continue;
-            }
-            box_width = min_size * sqrt(ar) / 2.;
-            box_height = min_size / sqrt(ar) / 2.;
-            // xmin
+          if (max_sizes.size() > 0) {
+            auto max_size = max_sizes[s];
+            // square prior with size sqrt(minSize * maxSize)
+            box_width = box_height = sqrt(min_size * max_size) / 2.;
            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            // ymin
            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            // xmax
            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            // ymax
            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
            idx++;
          }

--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
    out->Reset(
        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
  }

--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
    auto* out = scope.FindVar(Output("Out"))
                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();

    auto place_str = Attr<std::string>("place");
    platform::Place place;

--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
-    auto& out = detail::Ref(scope.FindVar(Output("Out")));
    int pass_num = Attr<int>("pass_num");
-    out.GetMutable<framework::ReaderHolder>()->Reset(
-        new MultiPassReader(underlying_reader.Get(), pass_num));
+    out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
  }
 };


--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                        ->Get<framework::ReaderHolder>();
-    auto& var = detail::Ref(scope.FindVar(Output("Out")));
-    var.GetMutable<framework::ReaderHolder>()->Reset(
+    out->Reset(
        new ShuffleReader(underlying_reader.Get(),
                          static_cast<size_t>(Attr<int>("buffer_size"))));
  }

--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
--- a/paddle/fluid/platform/.clang-format
+++ b/paddle/fluid/platform/.clang-format
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
--- a/paddle/fluid/pybind/.clang-format
+++ b/paddle/fluid/pybind/.clang-format
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
--- a/paddle/fluid/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
--- a/paddle/fluid/pybind/recordio.h
+++ b/paddle/fluid/pybind/recordio.h
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/fluid/pybind/tensor_py_test.cc
+++ b/paddle/fluid/pybind/tensor_py_test.cc
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
--- a/paddle/fluid/recordio/header.h
+++ b/paddle/fluid/recordio/header.h
--- a/paddle/fluid/recordio/header_test.cc
+++ b/paddle/fluid/recordio/header_test.cc
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
--- a/paddle/fluid/recordio/writer.cc
+++ b/paddle/fluid/recordio/writer.cc
--- a/paddle/fluid/recordio/writer.h
+++ b/paddle/fluid/recordio/writer.h
--- a/paddle/fluid/recordio/writer_scanner_test.cc
+++ b/paddle/fluid/recordio/writer_scanner_test.cc
--- a/paddle/fluid/string/.clang-format
+++ b/paddle/fluid/string/.clang-format
--- a/paddle/fluid/string/piece.cc
+++ b/paddle/fluid/string/piece.cc
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/fluid/string/printf_test.cc
--- a/paddle/fluid/string/to_string_test.cc
+++ b/paddle/fluid/string/to_string_test.cc
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/test_Upsample.cpp
+++ b/paddle/gserver/tests/test_Upsample.cpp
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/debuger.py
+++ b/python/paddle/fluid/debuger.py
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/distributed_spliter.py
+++ b/python/paddle/fluid/distributed_spliter.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
--- a/python/setup.py.in
+++ b/python/setup.py.in