Commit 6be51f10 authored by F fengjiayi


Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into modify_readers_to_fit_parallel_executor
@@ -25,12 +25,3 @@ third_party/
 # clion workspace.
 cmake-build-*
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT "extern_mklml")
 SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR "mklml")
...
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
     "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR})  # For snappystream to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install)  # For Paddle to include snappy stream headers.
 add_dependencies(snappystream extern_snappystream)
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})  # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)  # For Paddle code to include warpctc headers.
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
...
@@ -25,7 +25,8 @@ ELSE(WIN32)
   SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})  # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install)  # For Paddle code to include zlib.h.
 ExternalProject_Add(
     extern_zlib
...
@@ -251,7 +251,7 @@ function(cc_test TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction(cc_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction()
...
# FileManager Design Document
## Goals
This document describes the design of a system named FileManager, which makes it easy for users to upload their own training data for distributed training.
Its main features are:
- Common command-line commands for managing files and directories
- Resumable upload and download of large files
## Terminology
- PFS: short for `PaddlePaddle cloud File System`, an abstraction of the user's file storage space, as opposed to the local filesystem. We currently build it on CephFS.
- [CephFS](http://docs.ceph.com/docs/master/cephfs/): a POSIX-compliant file system.
- Chunk: the logical unit into which a file is divided.
## Modules
### Architecture
<img src="./src/filemanager.png" width=900>
### PFSClient
- Function: detailed design [link](./pfs/pfsclient.md)
  - Provides the commands users need to manage their files
  - Must be able to run across platforms
- Mutual authentication
PFSClient and Ingress perform mutual TLS authentication<sup>[tls](#tls)</sup>, so a user must first register at `cloud.paddlepaddle.org` to apply for user space and download the system-generated CA (certificate authority), Key, and CRT (CA-signed certificate) before using PFSClient.
### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
- Function:
Provides a layer-7 reverse proxy and sticky-session load balancing.
- Passing the user's identity through
Ingress needs to pass the PFSClient's identity on to PFSServer; see [link](http://www.integralist.co.uk/posts/clientcertauth.html#3) for the configuration.
### PFSServer
PFSServer exposes a RESTful API that receives and handles file-management requests from PFSClient and returns the results to the client (a sketch of calling one of these endpoints follows the list below).
RESTful API
- /api/v1/files
  - `GET /api/v1/files`: Get metadata of files or directories.
  - `POST /api/v1/files`: Create files or directories.
  - `PATCH /api/v1/files`: Update files or directories.
  - `DELETE /api/v1/files`: Delete files or directories.
- /api/v1/file/chunks
  - `GET /api/v1/storage/file/chunks`: Get the metadata of a file's chunks.
- /api/v1/storage/files
  - `GET /api/v1/storage/files`: Download files or directories.
  - `POST /api/v1/storage/files`: Upload files or directories.
- /api/v1/storage/file/chunks
  - `GET /api/v1/storage/file/chunks`: Download chunk data.
  - `POST /api/v1/storage/file/chunks`: Upload chunk data.
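As a rough illustration of the API above, a minimal Go client might fetch a file's chunk metadata like this. The endpoint path comes from the list above; the host, the query parameter, and the `ChunkMeta` response shape are assumptions for the sketch, not a fixed schema:
```
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// ChunkMeta is a hypothetical shape for the chunk metadata returned by
// GET /api/v1/storage/file/chunks; the real schema is defined by PFSServer.
type ChunkMeta struct {
	Offset   int64  `json:"offset"`
	Len      uint32 `json:"len"`
	Checksum uint32 `json:"checksum"`
}

func main() {
	// The host and query are placeholders; a real client would also present
	// its TLS client certificate, as required by the Ingress mutual auth.
	resp, err := http.Get("https://cloud.paddlepaddle.org/api/v1/storage/file/chunks?path=/pfs/dc1/home/user/file")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var chunks []ChunkMeta
	if err := json.NewDecoder(resp.Body).Decode(&chunks); err != nil {
		panic(err)
	}
	fmt.Printf("file has %d chunks\n", len(chunks))
}
```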
## File Transfer Optimization
### Chunked file transfer
User files can be large, so uploading them to the cloud or downloading them to the local machine can take a long time, and the network may become unstable during the transfer. To handle this we introduce the concept of a Chunk: a Chunk consists of the offset within its file, the data, the data length, and a checksum. Files are uploaded and downloaded entirely through operations on Chunks. Because a Chunk is small (256KB by default), a single transfer completes quickly and is unlikely to fail. After transferring the last Chunk, PFSClient checks whether the MD5 of the destination file matches that of the source file.
A typical Chunk looks like this:
```
type Chunk struct {
	fileOffset int64
	checksum   uint32
	len        uint32
	data       []byte
}
```
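A minimal sketch of filling in such a Chunk on the client side, assuming the checksum is a CRC32 over the chunk data (the concrete checksum algorithm is not fixed by this document):
```
package main

import (
	"fmt"
	"hash/crc32"
	"io"
	"os"
)

const chunkSize = 256 * 1024 // 256KB default chunk size

// readChunk reads one chunk starting at offset and computes its checksum.
func readChunk(f *os.File, offset int64) (data []byte, checksum uint32, err error) {
	buf := make([]byte, chunkSize)
	n, err := f.ReadAt(buf, offset)
	if err != nil && err != io.EOF { // a short read at EOF is fine
		return nil, 0, err
	}
	data = buf[:n]
	return data, crc32.ChecksumIEEE(data), nil
}

func main() {
	f, err := os.Open("train.data") // placeholder file name
	if err != nil {
		panic(err)
	}
	defer f.Close()

	data, sum, err := readChunk(f, 0)
	if err != nil {
		panic(err)
	}
	fmt.Printf("chunk: offset=0 len=%d checksum=%08x\n", len(data), sum)
}
```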
### Creating sparse files
When the destination file does not exist, or its size differs from that of the source file, [Fallocate](https://golang.org/pkg/syscall/#Fallocate) can be used to create a sparse file of the right size, after which multiple Chunks can be written concurrently.
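A sketch of preallocating the destination and writing chunks concurrently, assuming Linux (`syscall.Fallocate` is Linux-specific) and placeholder paths and sizes:
```
package main

import (
	"os"
	"sync"
	"syscall"
)

const chunkSize = 256 * 1024

// prepareDest makes sure the destination exists with the source's size,
// preallocated so chunks can then be written in any order.
func prepareDest(path string, size int64) (*os.File, error) {
	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0644)
	if err != nil {
		return nil, err
	}
	// mode 0 allocates the byte range [0, size); Linux-specific.
	if err := syscall.Fallocate(int(f.Fd()), 0, 0, size); err != nil {
		f.Close()
		return nil, err
	}
	return f, nil
}

func main() {
	f, err := prepareDest("dest.data", 4*chunkSize) // placeholder path/size
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Each chunk lands at its own offset, so concurrent writers don't conflict.
	var wg sync.WaitGroup
	for i := int64(0); i < 4; i++ {
		wg.Add(1)
		go func(offset int64) {
			defer wg.Done()
			data := make([]byte, chunkSize) // stand-in for downloaded chunk data
			f.WriteAt(data, offset)         // WriteAt is safe for concurrent use
		}(i * chunkSize)
	}
	wg.Wait()
}
```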
### Overwriting only the inconsistent parts
The key to the transfer is that PFSClient compares the checksums of the source and destination Chunks and only downloads or uploads the Chunks whose checksums differ. Parts that have already been transferred successfully therefore never need to be transferred again.
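A sketch of this resume logic, assuming both sides can list `(offset, checksum)` pairs for their chunks (for example via the chunk-metadata endpoint above):
```
package main

import "fmt"

// ChunkMeta pairs a chunk's offset with its checksum (hypothetical shape).
type ChunkMeta struct {
	Offset   int64
	Checksum uint32
}

// chunksToTransfer returns the offsets of source chunks that are missing
// from the destination or have a different checksum.
func chunksToTransfer(src, dst []ChunkMeta) []int64 {
	dstSums := make(map[int64]uint32, len(dst))
	for _, c := range dst {
		dstSums[c.Offset] = c.Checksum
	}
	var todo []int64
	for _, c := range src {
		if sum, ok := dstSums[c.Offset]; !ok || sum != c.Checksum {
			todo = append(todo, c.Offset)
		}
	}
	return todo
}

func main() {
	src := []ChunkMeta{{0, 0xaaaa}, {262144, 0xbbbb}, {524288, 0xcccc}}
	dst := []ChunkMeta{{0, 0xaaaa}, {262144, 0xdead}}
	fmt.Println(chunksToTransfer(src, dst)) // [262144 524288]
}
```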
## User Workflow
See [link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
## Scaffolding
[swagger](https://github.com/swagger-api/swagger-codegen) generates the scaffolding of PFSClient and PFSServer, so that we can put more of our energy into the logic itself.
## References
- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
- [linux man document](https://linux.die.net/man/)
# PFSClient
## Description
The `pfs` command is a Command Line Interface for managing your files on PaddlePaddle Cloud.
## Synopsis
```
paddle [options] pfs <subcommand> [parameters]
```
## Options
```
--profile (string)
Use a specific profile from your credential file.
--help
       Display more information about a command
--version
Output version information and exit
--debug
Show detailed debugging log
--only-show-errors (boolean)
Only errors and warnings are displayed. All other output is suppressed.
```
## Path Arguments
When using a command, we need to specify path arguments. There are two path argument types: `localpath` and `pfspath`.
A `pfspath` begins with `/pfs`, e.g. `/pfs/$DATACENTER/home/$USER/folder`.
[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters.
## Order of Path Arguments
Commonly, if there are two path arguments, the first is the source and the second is the destination.
## Subcommands
- rm - remove files or directories
```
Synopsis:
rm [-r] [-v] <PFSPath> ...
Options:
-r
Remove directories and their contents recursively
-v
Cause rm to be verbose, showing files after they are removed.
Examples:
paddle pfs rm /pfs/$DATACENTER/home/$USER/file
paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
```
- mv - move (rename) files
```
Synopsis:
mv [-f | -n] [-v] <LocalPath> <PFSPath>
mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
mv [-f | -n] [-v] <PFSPath> <LocalPath>
mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
mv [-f | -n] [-v] <PFSPath> <PFSPath>
mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
Options:
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause mv to be verbose, showing files after they are moved.
Examples:
paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
```
- cp - copy files or directories
```
Synopsis:
cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath>
cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
Options:
-r
Copy directories recursively
-f
Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
-n
Do not overwrite an existing file. (The -n option overrides previous -f options.)
-v
Cause cp to be verbose, showing files after they are copied.
--preserve--links
Preserve links when copying links
Examples:
paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
```
- ls - list files
```
Synopsis:
ls [-R] <PFSPath> ...
Options:
-R
List directory(ies) recursively
Examples:
paddle pfs ls /pfs/$DATACENTER/home/$USER/file
paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
```
- mkdir - make directory(ies)
Create intermediate directory(ies) as required.
```
Synopsis:
mkdir <PFSPath> ...
Examples:
paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
```
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
@@ -5,10 +5,10 @@ In a large scale machine learning setup where the size of the training data is h
 Polyak and Juditsky (1992) showed that the test performance of a simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/asgd.gif"><br />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
 </p>
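The averaging formula itself is rendered as an image above; assuming the standard Polyak–Juditsky form, it can be written as:
```
% Running (Polyak-Juditsky) average of the SGD iterates \theta_t over T steps:
\bar{\theta}_T = \frac{1}{T} \sum_{t=1}^{T} \theta_t
% or, maintained incrementally at each step:
\bar{\theta}_t = \bar{\theta}_{t-1} + \frac{1}{t}\left(\theta_t - \bar{\theta}_{t-1}\right)
```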
 We propose averaging for any optimizer, similar to how ASGD performs it, as mentioned above.
...
@@ -114,13 +114,13 @@ current thread under two conditions:
 #### Channel Send
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/channel_send.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
 </p>
 #### Channel Receive
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/channel_recv.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
 </p>
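Fluid's channel semantics follow Go's. As a point of reference, a minimal Go program showing the blocking send/receive behavior sketched in the two figures (this is plain Go, not Fluid code):
```
package main

import "fmt"

func main() {
	ch := make(chan int, 2) // buffered channel of capacity 2

	// Sends succeed immediately while the buffer has room.
	ch <- 1
	ch <- 2

	go func() {
		// This send blocks until a receiver frees a slot.
		ch <- 3
		close(ch)
	}()

	// Receives unblock the pending sender; range stops when the channel closes.
	for v := range ch {
		fmt.Println(v)
	}
}
```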
 ## Limitations and Considerations
...
@@ -23,21 +23,25 @@ The following table compares concepts in Fluid and Go
 <td>user-defined functions </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+<td></td>
 </tr>
 <tr>
 <td>control-flow and built-in functions </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+<td></td>
 </tr>
 <tr>
 <td>goroutines, channels </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+<td></td>
 </tr>
 <tr>
 <td>runtime </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+<td></td>
 </tr>
 </tbody>
 </table>
...
@@ -254,7 +254,7 @@ only one case will be executed.
 ### select_op flow
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/select_op_workflow.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
 </p>
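For reference, Go's select routine, which inspired this algorithm, behaves as follows; this sketch is plain Go, not Fluid code:
```
package main

import (
	"fmt"
	"time"
)

func main() {
	c1 := make(chan string)
	c2 := make(chan string)

	go func() { time.Sleep(10 * time.Millisecond); c1 <- "one" }()
	go func() { time.Sleep(20 * time.Millisecond); c2 <- "two" }()

	// select blocks until one of its cases is ready; if several are ready
	// it picks one at random -- exactly one case executes per iteration.
	for i := 0; i < 2; i++ {
		select {
		case msg := <-c1:
			fmt.Println("received", msg)
		case msg := <-c2:
			fmt.Println("received", msg)
		}
	}
}
```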
 The select algorithm is inspired by golang's select routine. Please refer to
...
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
 Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/compiler.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>
 PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/paddle-compile.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>
 The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/distributed_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>
 The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
@@ -152,7 +152,7 @@ for data in train_reader():
 The `JobDesc` object describes the distributed job resource specification to run on
 Cluster environment.
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/local_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>
 ### Training Data
...
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
 ## Transpiler
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/single-thread@3x.png" width="300">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">
 After conversion:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
 ## Implement
...
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/local-graph.png" width="300"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>
 After converting:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/dist-graph.png" width="700"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>
 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/sparse_update.png" width="700" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
 ### Benefits
 - Model parallelism becomes easier to implement: it is an extension to
...
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn.jpg"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
 </p>
 The above diagram shows an RNN unrolled into a full network.
...
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
 The following graph shows the training computational process of `batch_norm_op`:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
 cudnn provides APIs to finish the whole series of computation; we can use them in our GPU kernel.
@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
 `is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we should maintain two `batch_norm_op`s in the model: one whose `is_infer` is `True` (call it `infer_batch_norm_op`) and one whose `is_infer` is `False` (call it `train_batch_norm_op`). They share all parameters and variables but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it forks into two branches: one goes through `train_batch_norm_op` and the other goes through `infer_batch_norm_op`:
 <div align=center>
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
 </div>
 Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
...
@@ -6,17 +6,17 @@ A central problem in machine learning is how to design an algorithm that will pe
 ### Parameter Norm Penalties
 The most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
 The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
 The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
 ##### L2 Regularization:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
 ##### L1 Regularization
-<img src=".https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
 A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
@@ -40,11 +40,11 @@ The idea of building ops for regularization is in sync with the refactored Paddl
 Below is an example of a really simple feed forward neural network.
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
 The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
 ### Python API implementation for Regularization
...
@@ -116,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
 - **One** CTC-loss layer
 <div align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/ds2_network.png" width=350><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
 Figure 1. Architecture of Deep Speech 2 Network.
 </div>
@@ -208,7 +208,7 @@ TODO by Assignees
 ### Beam Search with CTC and LM
 <div align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/beam_search.png" width=600><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
 Figure 2. Algorithm for CTC Beam Search Decoder.
 </div>
...
@@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i
 ## LoD and shape changes during decoding
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
 </p>
 According to the image above, the only phase that changes the LoD is beam search.
...
@@ -7,14 +7,14 @@ It applies several important concepts in machine learning system design, includi
 In our GAN design, we wrap it as a user-friendly, easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
 Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
 </p>
 The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
 <p align="center">
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
 Figure 2. Photo borrowed from the original DC-GAN paper.
 </p>
...
@@ -9,5 +9,5 @@
 use_eigen_cn.md
 name_convention.md
 support_new_device.md
-releasing_process.md
+releasing_process_cn.md
 op_markdown_format.md
@@ -9,5 +9,5 @@ Development
 use_eigen_en.md
 name_convention.md
 support_new_device.md
-releasing_process.md
+releasing_process_en.md
 op_markdown_format.md
@@ -10,19 +10,10 @@ Each time PaddlePaddle releases a new version, it follows this process:
 * Use the Regression Test List as a checklist to test the correctness of this release.
 * If anything fails, record all failing cases, fix all bugs on the `release/[version]` branch, increase the patch number, and go back to step two.
 * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
-* Build this version's python wheel packages and publish them to pypi.
-* Since pypi.python.org follows the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix in the wheel package name must be renamed before uploading with twine, e.g. `linux_x86_64` becomes `manylinux1_x86_64`.
-* The package names on pypi are paddlepaddle and paddlepaddle_gpu. To upload a GPU package, change the name to "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel: `python setup.py bdist_wheel`.
-* How to upload:
-  ```
-  cd build/python
-  pip install twine
-  twine upload dist/[package to upload]
-  ```
-* Build this version's Docker release images and publish them to dockerhub. If this fails, fix the Docker image build, increase the patch number, and return to step two.
-1. After step three, merge the `release/[version]` branch into master and delete the `release/[version]` branch. Tag the merge commit on master with the version number, then merge `master` into `develop`, and finally delete the `release/[version]` branch.
-1. Cooperate on writing the Release Note.
+* Publish this version's python wheel packages to pypi.
+* Update the Docker images (see the operational details below).
+1. After step three, merge the `release/[version]` branch into master and tag the merge commit on master with the version number. Then merge `master` into `develop`.
+1. Cooperate on writing the Release Note.
 Note that:
@@ -31,13 +22,18 @@ Each time PaddlePaddle releases a new version, it follows this process:
 ## Publishing wheel packages to pypi
-Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
-to run the automated binary build. As shown in the figure below, choose the version to release (usually one CPU version and one GPU version), click the "..." button to the right of "run", and in the pop-up box choose the branch to release in the second tab (Changes), here 0.11.0, then click the "Run Build" button. After the build finishes,
-the 3 generated binaries can be found in the "Artifacts" drop-down on this page, corresponding to the CAPI, `cp27m` and `cp27mu` versions. Then upload them with `twine` as described above.
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/ci_build_whl.png">
+1. Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+to run the automated binary build. As shown in the figure below, choose the version to release (usually one CPU version and one GPU version), click the "..." button to the right of "run", and in the pop-up box choose the branch to release in the second tab (Changes), here 0.11.0, then click the "Run Build" button.
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build finishes, the 3 generated binaries can be found in the "Artifacts" drop-down on this page, corresponding to the CAPI, `cp27m` and `cp27mu` versions.
+1. Since pypi.python.org follows the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix in the wheel package name must be renamed before uploading with twine, e.g. `linux_x86_64` becomes `manylinux1_x86_64`.
+1. Upload:
+  ```
+  cd build/python
+  pip install twine
+  twine upload dist/[package to upload]
+  ```
 * Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment, to support more Linux
 distributions. If you need to build manually you can also use these images, which can be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/.
@@ -48,10 +44,20 @@ Each time PaddlePaddle releases a new version, it follows this process:
 After the PaddlePaddle CI above finishes building the wheels, it automatically pushes the Docker images to DockerHub, so publishing the Docker images only requires tagging the automatically pushed images
 with the version number:
-1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check whether the latest tag was updated after the wheel build above finished.
-1. Run `docker pull paddlepaddle/paddle:[latest tag]`; the latest tag can be latest, latest-gpu, etc.
-1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. Run `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+The image tags that need updating are:
+* `[version]`: CPU version
+* `[version]-openblas`: openblas version
+* `[version]-gpu`: GPU version (CUDA 8.0, cudnn 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: images for other cuda and cudnn versions
+Afterwards, visit https://hub.docker.com/r/paddlepaddle/paddle/tags/ to check that the publish succeeded.
 ## PaddlePaddle branching conventions
@@ -76,7 +82,7 @@ PaddlePaddle development uses [git-flow](http://nvie.com/posts/a-successful-git-
 ### All chapters of the PaddlePaddle Book
-Before each release, PaddlePaddle must first guarantee the correctness of all chapters in the PaddlePaddle Book. Correctness includes verifying both the current `paddle_trainer` training and pure-`Python` training of the models.
+Before each release, PaddlePaddle must first guarantee the correctness of all chapters in the PaddlePaddle Book. Correctness includes verifying both the current `paddle_trainer` training and pure-`Python` (V2 and Fluid) training of the models.
 <table>
 <thead>
...
# PaddlePaddle Releasing Process
PaddlePaddle manages its branches using the "git-flow branching model", and uses [Semantic Versioning](http://semver.org/) as its version numbering scheme.
Each time we release a new PaddlePaddle version, we should follow the below steps:
1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
1. Push a new tag on the release branch, with a tag name like `[version]rc.patch`. The
first tag should be `0.10.0rc1`, the second `0.10.0rc2`, and so on.
1. After that, we should do:
* Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
that this release has no major bugs.
* If a regression test fails, we must fix the bug and create a new `release/[version]`
branch from the previous release branch.
* Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
* Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
* Update the Docker images (see below instructions for detail).
1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
then merge `master` to `develop`.
1. Update the Release Note.
***NOTE:***
* Do ***NOT*** merge commits from the develop branch to release branches, to keep each release branch containing
features only for the current release, so that we can test that version.
* If we want to fix bugs on release branches, we must merge the fix to master, develop, and the release branch.
## Publish Wheel Packages to pypi
1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to build all wheel packages needed for publishing. As shown in the following picture, choose a build
version, click the "..." button on the right side of the "Run" button, switch to the second tab in the
pop-up box, choose the current release branch, and click the "Run Build" button. You may repeat this
step to start builds of different versions.
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
upload the package using `twine`, we need to rename the package from `linux_x86_64` to
`manylinux1_x86_64`.
1. Start the upload:
```
cd build/python
pip install twine
twine upload dist/[package to upload]
```
* NOTE: We use a special Docker image to build our releases to support more Linux distributions; you can
download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
scripts under `tools/manylinux1`.
* pypi does not allow overwriting an already-uploaded version of a wheel package, even if you delete the
old version. You must change the version number before uploading a new one.
## Publish Docker Images
Our CI tool will push the latest images to DockerHub, so we only need to push a version tag, like:
```
docker pull [image]:latest
docker tag [image]:latest [image]:[version]
docker push [image]:[version]
```
Tags that need to be updated are:
* `[version]`: CPU only version image
* `[version]-openblas`: openblas version image
* `[version]-gpu`: GPU version (CUDA 8.0, cudnn 5)
* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
You can then check the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
## Branching Model
We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
with some modifications:
* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
regression tests are run.
* `release/[version]` branch is used to publish each release. The latest release version branches contain
  bugfixes only for that version, but no feature updates.
* Developer forks are not required to follow the
  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
  branching model; every fork is like a feature branch.
  * Advice: a developer fork's develop branch is used to sync up with the main repo's develop branch.
  * Advice: developers fork new branches from their fork's develop branch to start developing.
    * Use that branch on the developer's fork to create pull requests and start reviews.
    * Developers can push new commits to that branch while the pull request is open.
* Bug fixes also start from a developer's forked repo, and bug-fix branches can merge to
  `master`, `develop` and `releases`.
## PaddlePaddle Regression Test List
### All Chapters of PaddlePaddle Book
We need to guarantee that all the chapters of the PaddlePaddle Book run correctly, including
V1 (`paddle_trainer`) training, V2 training, and Fluid training.
<table>
<thead>
<tr>
<th></th>
<th>Linear Regression</th>
<th>Recognize Digits</th>
<th>Image Classification</th>
<th>Word2Vec</th>
<th>Personalized Recommendation</th>
<th>Sentiment Analysis</th>
<th>Semantic Role Labeling</th>
<th>Machine Translation</th>
</tr>
</thead>
<tbody>
<tr>
<td>API.V2 + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>API.V2 + Ubuntu + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + CPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</tbody>
</table>
@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program? There many C++ A
 The overall flow is shown as the following figure.
-<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/profiler.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/profiler.png" align="center"/><br/>
 ### Event
...
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
...
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
...
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${START_END}
 )
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
     DEPENDS _swig_paddle
 )
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
 if(WITH_TESTING)
   IF(NOT PY_PIP_FOUND)
...
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
 py_test(testTrain SRCS testTrain.py)
 py_test(testMatrix SRCS testMatrix.py)
 py_test(testVector SRCS testVector.py)
...
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
...
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <deque>
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>
@@ -96,6 +97,8 @@ class BlockDesc {
    */
   void RemoveOp(size_t s, size_t e);
+
+  void RemoveVar(const std::string &name) { vars_.erase(name); }
 std::vector<OpDesc *> AllOps() const;
 size_t OpSize() const { return ops_.size(); }
...
@@ -14,8 +14,8 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>  // for size_t
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"
@@ -216,7 +216,8 @@ class ChannelHolder {
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+    explicit PlaceholderImpl(size_t buffer_size)
+        : type_(std::type_index(typeid(T))) {
       channel_.reset(MakeChannel<T>(buffer_size));
     }
...
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stddef.h> // for size_t #include <stddef.h> // for size_t
#include <atomic> #include <atomic>
#include <condition_variable> #include <condition_variable> // NOLINT
#include <deque> #include <deque>
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
virtual void Unlock(); virtual void Unlock();
virtual bool IsClosed(); virtual bool IsClosed();
virtual void Close(); virtual void Close();
ChannelImpl(size_t); explicit ChannelImpl(size_t);
virtual ~ChannelImpl(); virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data, virtual void AddToSendQ(const void *referrer, T *data,
...@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
const void *referrer; // TODO(thuan): figure out better way to do this const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback; std::function<bool(ChannelAction)> callback;
QueueMessage(T *item) explicit QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {} : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond) QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
...@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel<T> { ...@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel<T> {
} }
std::shared_ptr<QueueMessage> get_first_message( std::shared_ptr<QueueMessage> get_first_message(
std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) { std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
while (!queue.empty()) { while (!queue->empty()) {
// Check whether this message was added by Select // Check whether this message was added by Select
// If this was added by Select then execute the callback // If this was added by Select then execute the callback
// to check if you can execute this message. The callback // to check if you can execute this message. The callback
// can return false if some other case was executed in Select. // can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next. // In that case just discard this QueueMessage and process next.
std::shared_ptr<QueueMessage> m = queue.front(); std::shared_ptr<QueueMessage> m = queue->front();
queue.pop_front(); queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m; if (m->callback == nullptr || m->callback(action)) return m;
} }
return nullptr; return nullptr;
...@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) { ...@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) {
// to send to the receiver, bypassing the channel buffer if any // to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) { if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = std::shared_ptr<QueueMessage> m =
get_first_message(recvq, ChannelAction::SEND); get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) { if (m != nullptr) {
*(m->data) = std::move(*item); *(m->data) = std::move(*item);
...@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) { ...@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) {
// buffer and move front of send queue to the buffer // buffer and move front of send queue to the buffer
if (!sendq.empty()) { if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = std::shared_ptr<QueueMessage> m =
get_first_message(sendq, ChannelAction::RECEIVE); get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) { if (buf_.size() > 0) {
// Case 1 : Channel is Buffered // Case 1 : Channel is Buffered
// Do Data transfer from front of buffer // Do Data transfer from front of buffer
...@@ -219,8 +219,9 @@ bool ChannelImpl<T>::Receive(T *item) { ...@@ -219,8 +219,9 @@ bool ChannelImpl<T>::Receive(T *item) {
if (m != nullptr) { if (m != nullptr) {
*item = std::move(*(m->data)); *item = std::move(*(m->data));
m->Notify(); m->Notify();
} else } else {
return recv_return(Receive(item)); return recv_return(Receive(item));
}
} }
return recv_return(true); return recv_return(true);
} }
......
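Passing `recvq`/`sendq` into `get_first_message` by pointer instead of by non-const reference follows the Google C++ style rule: parameters the callee mutates take a pointer, so the `&` at the call site flags the side effect. A standalone sketch of the convention:

    #include <deque>

    // Pops and returns the front element; the pointer parameter signals mutation.
    int PopFront(std::deque<int>* queue) {
      int front = queue->front();
      queue->pop_front();
      return front;
    }

    int main() {
      std::deque<int> q = {1, 2, 3};
      int v = PopFront(&q);  // '&' makes the mutation visible at the call site
      return v == 1 ? 0 : 1;
    }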
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/channel.h"
#include <chrono> #include <chrono> // NOLINT
#include <thread> #include <thread> // NOLINT
#include "gtest/gtest.h" #include "gtest/gtest.h"
using paddle::framework::Channel; using paddle::framework::Channel;
...@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { ...@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() { std::thread t([&]() {
// Try to write more than buffer size. // Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) { for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size) if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations ch->Send(&i); // should block after 10 iterations
else { } else {
bool is_exception = false; bool is_exception = false;
try { try {
ch->Send(&i); ch->Send(&i);
...@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { ...@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
} }
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { ...@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
...@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) { ...@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked. // If ch is Buffered, at least 4 threads must be blocked.
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++; if (!thread_ended[i]) ct++;
} }
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// If ch is UnBuffered, all the threads should be blocked. // If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) { ...@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
if (isBuffered) { if (isBuffered) {
// Verify that only 1 send was successful // Verify that only 1 send was successful
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
// Only 1 send must be successful // Only 1 send must be successful
EXPECT_EQ(ct, 1); EXPECT_EQ(ct, 1);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that closing a buffered channel also unblocks // This tests that closing a buffered channel also unblocks
...@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { ...@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks // This tests that destroying a channel unblocks
// any senders waiting for channel to have write space // any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked // If channel is buffered, verify that at least 4 threads are blocked
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++; if (thread_ended[i] == false) ct++;
} }
// At least 4 threads must be blocked // At least 4 threads must be blocked
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
// Count number of successful sends // Count number of successful sends
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
...@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) { ...@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
} }
// Join all threads // Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that destroying a channel also unblocks // This tests that destroying a channel also unblocks
// any receivers waiting on the channel // any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<int> *ch) { void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) { ...@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked // Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
// delete the channel // delete the channel
delete ch; delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
...@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) { ...@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
} }
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { ...@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
...@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { ...@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked. // If ch is Buffered, at least 4 threads must be blocked.
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++; if (!thread_ended[i]) ct++;
} }
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// If ch is UnBuffered, all the threads should be blocked. // If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { ...@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
if (isBuffered) { if (isBuffered) {
// Verify that only 1 send was successful // Verify that only 1 send was successful
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
// Only 1 send must be successful // Only 1 send must be successful
EXPECT_EQ(ct, 1); EXPECT_EQ(ct, 1);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that closing a channelholder unblocks // This tests that closing a channelholder unblocks
...@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) { ...@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks // This tests that destroying a channelholder unblocks
// any senders waiting for channel // any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
bool send_success[num_threads]; bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers // Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
send_success[i] = false; send_success[i] = false;
t[i] = std::thread( t[i] = std::thread(
...@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) { if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked // If channel is buffered, verify that at least 4 threads are blocked
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++; if (thread_ended[i] == false) ct++;
} }
// At least 4 threads must be blocked // At least 4 threads must be blocked
EXPECT_GE(ct, 4); EXPECT_GE(ct, 4);
} else { } else {
// Verify that all the threads are blocked // Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
} }
...@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
// Count number of successful sends // Count number of successful sends
int ct = 0; int ct = 0;
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++; if (send_success[i]) ct++;
} }
...@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { ...@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
} }
// Join all threads // Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
// This tests that destroying a channelholder also unblocks // This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel // any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
size_t num_threads = 5; const size_t kNumThreads = 5;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers // Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { ...@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked // Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false); EXPECT_EQ(thread_ended[i], false);
} }
// delete the channel // delete the channel
delete ch; delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked // Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) { TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
...@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) { ...@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests closing a channelholder many times. // This tests closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) { void ChannelHolderManyTimesClose(ChannelHolder *ch) {
const int num_threads = 15; const int kNumThreads = 15;
std::thread t[num_threads]; std::thread t[kNumThreads];
bool thread_ended[num_threads]; bool thread_ended[kNumThreads];
// Launches threads that try to send data to the channel. // Launches threads that try to send data to the channel.
for (size_t i = 0; i < num_threads / 3; i++) { for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *ended) { [&](bool *ended) {
...@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
} }
// Launches threads that try to receive data from the channel. // Launches threads that try to receive data from the channel.
for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) { for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
} }
// Launches threads that try to close the channel. // Launches threads that try to close the channel.
for (size_t i = 2 * num_threads / 3; i < num_threads; i++) { for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false; thread_ended[i] = false;
t[i] = std::thread( t[i] = std::thread(
[&](bool *p) { [&](bool *p) {
...@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) { ...@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked // Verify that all threads are unblocked
for (size_t i = 0; i < num_threads; i++) { for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true); EXPECT_EQ(thread_ended[i], true);
} }
EXPECT_TRUE(ch->IsClosed()); EXPECT_TRUE(ch->IsClosed());
// delete the channel // delete the channel
delete ch; delete ch;
for (size_t i = 0; i < num_threads; i++) t[i].join(); for (size_t i = 0; i < kNumThreads; i++) t[i].join();
} }
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) { TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
......
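The `num_threads` to `kNumThreads` rename throughout these tests is not just the `kConstantName` convention: with a `const` compile-time bound, `std::thread t[kNumThreads]` is a plain fixed-size array, whereas the old non-const variable made it a variable-length array, a GCC/Clang extension rather than standard C++. A compiling sketch of the difference:

    #include <cstddef>
    #include <thread>

    int main() {
      const size_t kNumThreads = 5;   // constant expression
      std::thread t[kNumThreads];     // standard C++: array bound is constant
      // size_t n = 5; std::thread t2[n];  // VLA: non-portable extension
      for (size_t i = 0; i < kNumThreads; ++i) t[i] = std::thread([] {});
      for (size_t i = 0; i < kNumThreads; ++i) t[i].join();
    }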
...@@ -16,6 +16,6 @@ else() ...@@ -16,6 +16,6 @@ else()
endif() endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context) simple_threadpool device_context)
...@@ -142,6 +142,7 @@ class LoDTensor : public Tensor { ...@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1; return (lod_)[level].size() - 1;
} }
// Split LoDTensor and copy to each place specified in places.
std::vector<LoDTensor> SplitLoDTensor( std::vector<LoDTensor> SplitLoDTensor(
const std::vector<platform::Place> places) const; const std::vector<platform::Place> places) const;
......
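`SplitLoDTensor` slices one host-side batch into per-place shards. A hedged usage fragment based only on the signature above (`batch` is assumed to be an initialized LoDTensor):

    // Presumably one shard per place; shard j is then fed on places[j].
    std::vector<paddle::platform::Place> places = {
        paddle::platform::CPUPlace(), paddle::platform::CPUPlace()};
    std::vector<paddle::framework::LoDTensor> shards =
        batch.SplitLoDTensor(places);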
...@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = { ...@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
}; };
proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
if (var->IsType<framework::LoDTensor>()) {
return framework::ToDataType(var->Get<framework::LoDTensor>().type());
} else if (var->IsType<framework::SelectedRows>()) {
return framework::ToDataType(
var->Get<framework::SelectedRows>().value().type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
}
static DDim GetDims(const Scope& scope, const std::string& name) { static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
if (var == nullptr) { if (var == nullptr) {
......
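The new `GetDataTypeOfVar` centralizes a common dispatch: read the element type straight off a `LoDTensor`, or off the `value()` tensor inside a `SelectedRows`, and throw for anything else. A hedged usage fragment (`scope` and the variable name "W" are assumptions):

    const paddle::framework::Variable* var = scope.FindVar("W");
    // Throws via PADDLE_THROW unless var holds a LoDTensor or SelectedRows.
    paddle::framework::proto::VarType::Type dtype =
        paddle::framework::GetDataTypeOfVar(var);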
...@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) { ...@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix; return var_name + kGradVarSuffix;
} }
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
class OperatorBase; class OperatorBase;
class ExecutionContext; class ExecutionContext;
......
...@@ -150,13 +150,30 @@ void ParallelExecutor::BCastParamsToGPUs( ...@@ -150,13 +150,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif #endif
} }
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors, void ParallelExecutor::Run(
const std::string &fetched_var_name) { const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name,
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0); platform::RecordBlock b(0);
SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors); auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() = *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data; fetch_data;
} }
void ParallelExecutor::SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
for (auto it : feed_tensors) {
auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var?
member_->local_scopes_[j]
->Var(it.first)
->GetMutable<LoDTensor>()
->ShareDataWith(lod_tensors[j]);
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -42,9 +42,13 @@ class ParallelExecutor { ...@@ -42,9 +42,13 @@ class ParallelExecutor {
bool allow_op_delay); bool allow_op_delay);
void Run(const std::vector<std::string>& fetch_tensors, void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name = "fetched_var"); const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
private: private:
void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_; ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const; void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
......
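After this change, feeding and running are a single call: `Run` first scatters every entry of `feed_tensors` across the devices via `SplitTensorToPlaces`, then executes the graph and stores the fetches under `fetched_var_name`. A hedged sketch of the new call shape (`exe` and `image` are assumed to be initialized):

    std::unordered_map<std::string, paddle::framework::LoDTensor> feeds;
    feeds["image"] = image;  // split across the executor's places inside Run
    exe.Run({"loss"}, "fetched_var", feeds);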
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -13,6 +16,7 @@ limitations under the License. */ ...@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) { const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version { // the 1st field, uint32_t version
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -47,6 +50,15 @@ class SelectedRows { ...@@ -47,6 +50,15 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; } void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/**
* get the index of id in rows
*/
int64_t index(int64_t id) const {
auto it = std::find(rows_.begin(), rows_.end(), id);
PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const { DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims()); std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_; dims[0] = height_;
......
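`SelectedRows::index` is a linear scan: `std::find` locates the id in `rows_`, `PADDLE_ENFORCE` rejects absent ids, and `std::distance` converts the iterator to an offset. A standalone sketch of the same idiom, returning -1 instead of throwing:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    int64_t IndexOf(const std::vector<int64_t>& rows, int64_t id) {
      auto it = std::find(rows.begin(), rows.end(), id);
      if (it == rows.end()) return -1;  // the real code enforces presence
      return static_cast<int64_t>(std::distance(rows.begin(), it));
    }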
...@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>( holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type)); boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) { } else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
} }
#else #else
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>( if (platform::is_gpu_place(place)) {
boost::get<platform::CUDAPlace>(place), size, type)); holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
} }
#endif #endif
offset_ = 0; offset_ = 0;
...@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { ...@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
inline void* Tensor::mutable_data(platform::Place place) { inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr, PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing"); "Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type()); return mutable_data(place, holder_->type());
} }
......
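Accepting `CUDAPinnedPlace` in `mutable_data` matters because only page-locked host memory can back a truly asynchronous `cudaMemcpyAsync`. A minimal sketch of the underlying allocation, assuming a CUDA toolchain and eliding error checks:

    #include <cstddef>
    #include <cuda_runtime.h>

    // Page-locked host allocation; unlike malloc, the driver can DMA from it
    // while the CPU keeps running.
    void* AllocPinned(size_t bytes) {
      void* p = nullptr;
      cudaMallocHost(&p, bytes);
      return p;
    }

    void FreePinned(void* p) { cudaFreeHost(p); }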
...@@ -35,24 +35,25 @@ class Tuple { ...@@ -35,24 +35,25 @@ class Tuple {
public: public:
using ElementVars = std::vector<ElementVar>; using ElementVars = std::vector<ElementVar>;
Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc) Tuple(const std::vector<ElementVar>& var,
const std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {} : var_(var), var_desc_(var_desc) {}
Tuple(std::vector<ElementVar>& var) : var_(var) {} explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
ElementVar get(int idx) const { return var_[idx]; }; ElementVar get(int idx) const { return var_[idx]; }
ElementVar& get(int idx) { return var_[idx]; }; ElementVar& get(int idx) { return var_[idx]; }
bool isSameType(Tuple& t) const; bool isSameType(const Tuple& t) const;
size_t getSize() const { return var_.size(); }; size_t getSize() const { return var_.size(); }
private: private:
ElementVars var_; ElementVars var_;
std::vector<VarDesc> var_desc_; std::vector<VarDesc> var_desc_;
}; };
bool Tuple::isSameType(Tuple& t) const { bool Tuple::isSameType(const Tuple& t) const {
size_t tuple_size = getSize(); size_t tuple_size = getSize();
if (tuple_size != t.getSize()) { if (tuple_size != t.getSize()) {
return false; return false;
......
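The `Tuple` cleanup bundles three routine fixes: `const` references for read-only parameters, `explicit` on the remaining single-argument constructor, and removal of the stray semicolons after member-function bodies. The same fixes compressed onto a toy class:

    #include <vector>

    class IntTuple {
     public:
      explicit IntTuple(const std::vector<int>& v) : v_(v) {}  // explicit + const ref
      int get(int idx) const { return v_[idx]; }       // no trailing ';' after body
      bool isSameSize(const IntTuple& t) const { return v_.size() == t.v_.size(); }

     private:
      std::vector<int> v_;
    };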
...@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) { ...@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false; return false;
} }
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename) { const std::string& param_filename) {
...@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor, ...@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
} }
std::unique_ptr<framework::ProgramDesc> Load( std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor, framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope, const std::string& prog_filename, const std::string& param_filename) {
const std::string& prog_filename,
const std::string& param_filename) {
std::string model_filename = prog_filename; std::string model_filename = prog_filename;
std::string program_desc_str; std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str); ReadBinaryFile(model_filename, program_desc_str);
......
...@@ -24,8 +24,7 @@ limitations under the License. */ ...@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void LoadPersistables(framework::Executor& executor, void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
framework::Scope& scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename); const std::string& param_filename);
......
...@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME) ...@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS) set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "") set(arg_list "")
if(inference_test_ARGS) if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS}) foreach(arg ${inference_test_ARGS})
......
...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) { ...@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
// The second dim of the input tensor should be 13 // The second dim of the input tensor should be 13
// The input data should be >= 0 // The input data should be >= 0
int64_t batch_size = 10; int64_t batch_size = 10;
SetupTensor<float>( SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10)); static_cast<float>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -35,10 +35,8 @@ TEST(inference, image_classification) { ...@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normalized image pixels as input data, // Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
{FLAGS_batch_size, 3, 32, 32}, static_cast<float>(0), static_cast<float>(1));
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -48,8 +46,8 @@ TEST(inference, image_classification) { ...@@ -48,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -59,8 +57,8 @@ TEST(inference, image_classification) { ...@@ -59,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---"; LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) { ...@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
int64_t predicate_dict_len = 3162; int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2; int64_t mark_dict_len = 2;
SetupLoDTensor(word, SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(predicate, SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(predicate_dict_len - 1)); static_cast<int64_t>(predicate_dict_len - 1));
SetupLoDTensor(ctx_n2, SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_n1, SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_0, SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p1, SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p2, SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(mark, SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(mark_dict_len - 1)); static_cast<int64_t>(mark_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) { ...@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normalized image pixels as input data, // Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0]. // which should be in the range [-1.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
{FLAGS_batch_size, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
...@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) { ...@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
// Run inference on CPU // Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>( TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) { ...@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
// Run inference on CUDA GPU // Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>( TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -36,25 +36,25 @@ TEST(inference, recommender_system) { ...@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
// Use the first data from paddle.dataset.movielens.test() as input // Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1}; std::vector<int64_t> user_id_data = {1};
SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data); SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1}; std::vector<int64_t> gender_id_data = {1};
SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data); SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0}; std::vector<int64_t> age_id_data = {0};
SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data); SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10}; std::vector<int64_t> job_id_data = {10};
SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data); SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783}; std::vector<int64_t> movie_id_data = {783};
SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data); SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9}; std::vector<int64_t> category_id_data = {10, 8, 9};
SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data); SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988}; std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data); SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id); cpu_feeds.push_back(&user_id);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) { ...@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
paddle::framework::LoDTensor word_data, trg_word; paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}}; paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor( SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1)); static_cast<int64_t>(1));
SetupLoDTensor( SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1)); static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data); cpu_feeds.push_back(&word_data);
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) { ...@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoD lod{{0, 4, 10}}; paddle::framework::LoD lod{{0, 4, 10}};
int64_t word_dict_len = 5147; int64_t word_dict_len = 5147;
SetupLoDTensor(words, SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
lod,
static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1)); static_cast<int64_t>(word_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
...@@ -33,10 +33,10 @@ TEST(inference, word2vec) { ...@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
paddle::framework::LoD lod{{0, 1}}; paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of the dictionary int64_t dict_size = 2073; // The size of the dictionary
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word); cpu_feeds.push_back(&first_word);
......
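The `test_helper.h` hunk below replaces `srand(time(0))` plus `rand()` with a `std::mt19937` seeded by a fixed constant, so generated test inputs are identical on every run instead of varying with the clock. A standalone sketch of the replacement idiom:

    #include <random>

    // Deterministic uniform values in [lower, upper); same sequence every run.
    void FillUniform(float* dst, int n, float lower, float upper) {
      std::mt19937 rng(100);  // fixed seed, as in the hunk below
      std::uniform_real_distribution<double> dist(0, 1);
      for (int i = 0; i < n; ++i) {
        dst[i] = static_cast<float>(dist(rng) * (upper - lower) + lower);
      }
    }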
...@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <map>
#include <random>
#include <string>
#include <vector>
#include <time.h>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input, void SetupTensor(paddle::framework::LoDTensor* input,
                 paddle::framework::DDim dims, T lower, T upper) { paddle::framework::DDim dims, T lower, T upper) {
  srand(time(0)); std::mt19937 rng(100);  // An arbitrarily chosen but fixed seed.
  std::uniform_real_distribution<double> uniform_dist(0, 1);
  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
  for (int i = 0; i < input.numel(); ++i) { for (int i = 0; i < input->numel(); ++i) {
    input_ptr[i] = (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) + lower; input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
  } }
} }
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input, paddle::framework::DDim dims, std::vector<T>& data) { void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, const std::vector<T>& data) {
CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size())); CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace()); T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input.numel() * sizeof(T)); memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
} }
template <typename T> template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input, paddle::framework::LoD& lod, T lower, T upper) { void SetupLoDTensor(paddle::framework::LoDTensor* input, const paddle::framework::LoD& lod, T lower, T upper) {
  input.set_lod(lod); input->set_lod(lod);
int dim = lod[0][lod[0].size() - 1]; int dim = lod[0][lod[0].size() - 1];
SetupTensor<T>(input, {dim, 1}, lower, upper); SetupTensor<T>(input, {dim, 1}, lower, upper);
} }
template <typename T> template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input, void SetupLoDTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, paddle::framework::DDim dims,
paddle::framework::LoD lod, const paddle::framework::LoD lod,
std::vector<T>& data) { const std::vector<T>& data) {
const size_t level = lod.size() - 1; const size_t level = lod.size() - 1;
CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back())); CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
input.set_lod(lod); input->set_lod(lod);
SetupTensor<T>(input, dims, data); SetupTensor<T>(input, dims, data);
} }
template <typename T> template <typename T>
void CheckError(paddle::framework::LoDTensor& output1, void CheckError(const paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) { const paddle::framework::LoDTensor& output2) {
// Check lod information // Check lod information
EXPECT_EQ(output1.lod(), output2.lod()); EXPECT_EQ(output1.lod(), output2.lod());
...@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1, ...@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1,
template <typename Place> template <typename Place>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs, const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
                   const int repeat = 1, const bool is_combined = false) {
// 1. Define place, executor, scope // 1. Define place, executor, scope
auto place = Place(); auto place = Place();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
...@@ -132,11 +131,9 @@ void TestInference(const std::string& dirname, ...@@ -132,11 +131,9 @@ void TestInference(const std::string& dirname,
// `fluid.io.save_inference_model`. // `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__"; std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__"; std::string param_filename = "__params_combined__";
      inference_program = paddle::inference::Load(executor, *scope, dirname + "/" + prog_filename, dirname + "/" + param_filename);
} else { } else {
    // Parameters are saved in separate files located in the specified // Parameters are saved in separate files located in the specified
    // `dirname`. // `dirname`.
......
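Taken together, the refactored helpers are driven the way the tests above drive them: fill a LoDTensor, collect feed/fetch pointers, then call TestInference. A minimal sketch (the TEST name, the LoD, and the value range 0..99 are illustrative placeholders, not taken from any specific test):

// Sketch of a test built on the refactored helpers; FLAGS_dirname comes from
// the DEFINE_string(dirname, ...) flag each test file declares.
TEST(inference, minimal_example) {
  paddle::framework::LoDTensor input, output;
  paddle::framework::LoD lod{{0, 4}};
  // Pointer argument and const LoD match the new SetupLoDTensor signature.
  SetupLoDTensor(&input, lod, static_cast<int64_t>(0),
                 static_cast<int64_t>(99));
  std::vector<paddle::framework::LoDTensor*> cpu_feeds{&input};
  std::vector<paddle::framework::LoDTensor*> cpu_fetchs{&output};
  TestInference<paddle::platform::CPUPlace>(FLAGS_dirname, cpu_feeds,
                                            cpu_fetchs);
}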
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -95,7 +95,7 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) { ...@@ -95,7 +95,7 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
int cur_dev = platform::GetCurrentDeviceId(); int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device); platform::SetDeviceId(place.device);
size_t avail, total; size_t avail, total;
platform::GpuMemoryUsage(avail, total); platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes"; << place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total; LOG(WARNING) << "total " << total;
......
...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h" #include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h" #include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include <unordered_map>
inline bool is_aligned(void const *p) { inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3); return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
} }
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") ...@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS) list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "") set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET) function(op_library TARGET)
# op_library is a function to create op library. The interface is same as # op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library # cc_library. But it handle split GPU/CPU code and link some common library
...@@ -193,6 +193,7 @@ if(WITH_DISTRIBUTE) ...@@ -193,6 +193,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op) set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
......
...@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> { ...@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    // Tensor cores are supported since the Volta GPU and
    // are only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
// get workspace size able to allocate // get workspace size able to allocate
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes)); cudnn_output_desc, algo, &workspace_size_in_bytes));
// It is possible for float16 on Volta GPU to allocate more memory than
    // the limit because the algo is overridden to use tensor cores.
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory // Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace()); platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
......
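The guard above combines three conditions: a CUDA 9 / cuDNN 7.0.1 build, a Volta-or-newer device, and float16 input/filter data. Factored out as a standalone predicate it would look like the sketch below (ShouldUseTensorCoreMath is a hypothetical name, not a Paddle function):

// Sketch only: the eligibility test behind the #if/if block above.
bool ShouldUseTensorCoreMath(int compute_capability, bool is_float16) {
  // Tensor cores ship with Volta (compute capability 7.0), and this code
  // path only turns them on for fp16 convolutions.
  return compute_capability >= 70 && is_float16;
}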
...@@ -186,7 +186,8 @@ void AsyncGRPCServer::WaitClientGet(int count) { ...@@ -186,7 +186,8 @@ void AsyncGRPCServer::WaitClientGet(int count) {
void AsyncGRPCServer::RunSyncUpdate() { void AsyncGRPCServer::RunSyncUpdate() {
::grpc::ServerBuilder builder; ::grpc::ServerBuilder builder;
builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials()); builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
&selected_port_);
builder.SetMaxSendMessageSize(std::numeric_limits<int>::max()); builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
builder.RegisterService(&service_); builder.RegisterService(&service_);
...@@ -196,7 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -196,7 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
cq_prefetch_ = builder.AddCompletionQueue(); cq_prefetch_ = builder.AddCompletionQueue();
server_ = builder.BuildAndStart(); server_ = builder.BuildAndStart();
LOG(INFO) << "Server listening on " << address_ << std::endl; LOG(INFO) << "Server listening on " << address_
<< " selected port: " << selected_port_;
std::function<void()> send_register = std::function<void()> send_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
......
...@@ -63,6 +63,8 @@ class AsyncGRPCServer final { ...@@ -63,6 +63,8 @@ class AsyncGRPCServer final {
void SetExecutor(framework::Executor *executor) { executor_ = executor; } void SetExecutor(framework::Executor *executor) { executor_ = executor; }
int GetSelectedPort() { return selected_port_; }
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
void Push(const std::string &msg_name) { void Push(const std::string &msg_name) {
...@@ -111,6 +113,7 @@ class AsyncGRPCServer final { ...@@ -111,6 +113,7 @@ class AsyncGRPCServer final {
int prefetch_blk_id_; int prefetch_blk_id_;
framework::ProgramDesc *program_; framework::ProgramDesc *program_;
framework::Executor *executor_; framework::Executor *executor_;
int selected_port_;
}; };
}; // namespace detail }; // namespace detail
......
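The selected_port_ plumbing relies on a standard gRPC feature: binding to port 0 asks the OS for any free port, and AddListeningPort writes the chosen port back through its third argument once the server starts. A self-contained sketch using plain gRPC, without any Paddle types:

#include <grpc++/grpc++.h>
#include <memory>

// Sketch: bind an ephemeral port and read back what the OS picked, as
// AsyncGRPCServer::RunSyncUpdate now does with selected_port_.
std::unique_ptr<grpc::Server> StartOnFreePort(int* selected_port) {
  grpc::ServerBuilder builder;
  builder.AddListeningPort("127.0.0.1:0", grpc::InsecureServerCredentials(),
                           selected_port);
  // After BuildAndStart returns, *selected_port holds the bound port.
  return builder.BuildAndStart();
}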
...@@ -27,8 +27,8 @@ template <typename T> ...@@ -27,8 +27,8 @@ template <typename T>
class MKLDNNMD { class MKLDNNMD {
public: public:
explicit MKLDNNMD(const T* in, const T* w, bool bias) explicit MKLDNNMD(const T* in, const T* w, bool bias)
: in{paddle::framework::vectorize2int(in->dims())}, : in(paddle::framework::vectorize2int(in->dims())),
w{paddle::framework::vectorize2int(w->dims())} { w(paddle::framework::vectorize2int(w->dims())) {
with_bias_ = bias; with_bias_ = bias;
} }
...@@ -78,7 +78,7 @@ class MKLDNNMD { ...@@ -78,7 +78,7 @@ class MKLDNNMD {
class MKLDNNMemory { class MKLDNNMemory {
public: public:
MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e) MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e)
: md_{t}, engine_{e} {} : md_(t), engine_(e) {}
virtual ~MKLDNNMemory() = default; virtual ~MKLDNNMemory() = default;
template <typename Output> template <typename Output>
......
...@@ -12,20 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,20 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint.h>
#include <ostream> #include <ostream>
#include <thread>
#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) { void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate(); service->RunSyncUpdate();
VLOG(4) << "RunServer thread end"; VLOG(4) << "RunServer thread end";
...@@ -66,143 +60,138 @@ static void ParallelExecuteBlocks( ...@@ -66,143 +60,138 @@ static void ParallelExecuteBlocks(
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
} }
ListenAndServOp::ListenAndServOp(const std::string &type,
                                 const framework::VariableNameMap &inputs,
                                 const framework::VariableNameMap &outputs,
                                 const framework::AttributeMap &attrs)
    : OperatorBase(type, inputs, outputs, attrs) {}

int ListenAndServOp::GetSelectedPort() {
  return rpc_service_->GetSelectedPort();
}

void ListenAndServOp::Stop() {
  rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
  server_thread_->join();
}

void ListenAndServOp::RunImpl(const framework::Scope &scope,
                              const platform::Place &dev_place) const {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto &dev_ctx = *pool.Get(dev_place);
  framework::Scope &recv_scope = scope.NewScope();

  if (!rpc_service_) {
    std::string endpoint = Attr<std::string>("endpoint");
    rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
  }

  auto ins = Inputs("X");
  auto fan_in = Attr<int>("Fanin");
  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
  auto *program = block->Program();
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");

  framework::Executor executor(dev_place);
  std::vector<int> block_list;
  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
    block_list.push_back(blkid);
  }
  auto prepared = executor.Prepare(*program, block_list);
  // Insert placeholder for block0 which holds current op itself.
  prepared.insert(prepared.begin(),
                  std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));

  rpc_service_->SetScope(&recv_scope);
  rpc_service_->SetDevCtx(&dev_ctx);
  // TODO(qiao) set proper fields for table lookup and update
  rpc_service_->SetExecutor(&executor);
  rpc_service_->SetPrefetchBlkdId(0);
  rpc_service_->SetProgram(program);
  // start the server listening after all member initialized.
  server_thread_.reset(new std::thread(RunServer, rpc_service_));
  // FIXME(typhoonzero): do we need to wait until the server port is ready?
  sleep(5);

  // TODO(typhoonzero): change this to a while_op for every cluster-batch.
  bool exit_flag = false;
  // Record received sparse variables, so that
  // we could reset those after execute optimize program
  std::vector<framework::Variable *> sparse_vars;
  while (!exit_flag) {
    // Get from multiple trainers, we don't care about the order in which
    // the gradients arrive, just add suffix 0~n and merge the gradient.
    rpc_service_->SetCond(0);
    size_t recv_var_cnt = 0;
    int batch_barrier = 0;
    while (batch_barrier != fan_in) {
      const detail::ReceivedMessage v = rpc_service_->Get();
      auto recv_var_name = v.first;
      if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
        LOG(INFO) << "received terminate message and exit";
        exit_flag = true;
        break;
      } else if (recv_var_name == BATCH_BARRIER_MESSAGE) {
        VLOG(3) << "recv batch barrier message";
        batch_barrier++;
        continue;
      } else {
        VLOG(3) << "received grad: " << recv_var_name;
        recv_var_cnt++;
        auto var = v.second->GetVar();
        if (var == nullptr) {
          LOG(ERROR) << "Can not find server side var: " << recv_var_name;
          PADDLE_THROW("Can not find server side var");
        }
        if (var->IsType<framework::SelectedRows>()) {
          sparse_vars.push_back(var);
        }
      }
    }
    if (exit_flag) {
      rpc_service_->SetCond(1);
      rpc_service_->ShutDown();
      break;
    }

    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
    // and this will still work.

    // The optimize blocks which have the same parent ID would run parallel
    // TODO(Yancey1989): need to use ParallelExecutor for future
    int32_t last_parent_blkid = program->Block(1).Parent();
    std::vector<size_t> parallel_blkids;
    parallel_blkids.push_back(1);
    double ts = detail::GetTimestamp();
    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
      if (program->Block(blkid).Parent() != last_parent_blkid) {
        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
                              &recv_scope);
        parallel_blkids.clear();
        last_parent_blkid = program->Block(blkid).Parent();
      }
      parallel_blkids.push_back(blkid);
    }
    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
                          &recv_scope);
    VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";

    // Reset the received sparse variables, the sum operator would not
    // sum the input sparse variables whose rows are empty at the next
    // mini-batch.
    // TODO(Yancey1989): move the reset action into an operator, we couldn't
    // have any hidden logic in the operator.
    for (auto &var : sparse_vars) {
      var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
    }
    rpc_service_->SetCond(1);
    // FIXME(typhoonzero): use another condition to sync wait clients get.
    rpc_service_->WaitClientGet(fan_in);
    sparse_vars.clear();
  }  // while(true)
}
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <ostream>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs);
int GetSelectedPort();
void Stop() override;
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override;
protected:
mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
mutable std::shared_ptr<std::thread> server_thread_;
};
} // namespace operators
} // namespace paddle
...@@ -18,22 +18,6 @@ limitations under the License. */ ...@@ -18,22 +18,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
static inline framework::OpKernelType ExpectedKernelType(
const framework::ExecutionContext& ctx) {
auto* table_var = ctx.InputVar("W");
if (table_var->IsType<LoDTensor>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<LoDTensor>().type()),
ctx.device_context());
} else if (table_var->IsType<SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
ctx.device_context());
} else {
PADDLE_THROW("W should be LoDTensor or SelectedRows");
}
}
class LookupTableOp : public framework::OperatorWithKernel { class LookupTableOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
...@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
......
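framework::GetDataTypeOfVar centralizes exactly the branch the removed ExpectedKernelType helper spelled out: read the element type from a LoDTensor directly, or from the value() tensor of a SelectedRows. In outline (a sketch of that logic, not the framework source, which lives in paddle/fluid/framework):

// Outline of the dispatch a GetDataTypeOfVar-style helper performs (sketch).
framework::proto::VarType::Type GetDataTypeOfVarSketch(
    const framework::Variable* var) {
  if (var->IsType<framework::LoDTensor>()) {
    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
  } else if (var->IsType<framework::SelectedRows>()) {
    return framework::ToDataType(
        var->Get<framework::SelectedRows>().value().type());
  }
  PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}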
...@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor; ...@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows; using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim; using DDim = framework::DDim;
static constexpr int64_t kNoPadding = -1; constexpr int64_t kNoPadding = -1;
inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
auto it = std::find(rows.begin(), rows.end(), value);
PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
return static_cast<size_t>(std::distance(rows.begin(), it));
}
template <typename T> template <typename T>
class LookupTableKernel : public framework::OpKernel<T> { class LookupTableKernel : public framework::OpKernel<T> {
...@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W"); auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims(); table_dim = table_t->value().dims();
} else { } else {
PADDLE_THROW("table only support LoDTensor and SelectedRows"); PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
} }
int64_t *ids; int64_t *ids;
...@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
memset(output + i * row_width, 0, row_width * sizeof(T)); memset(output + i * row_width, 0, row_width * sizeof(T));
} else { } else {
PADDLE_ENFORCE_GE(ids[i], 0); PADDLE_ENFORCE_GE(ids[i], 0);
auto id_index = getIndex(table_t.rows(), ids[i]); auto id_index = table_t.index(ids[i]);
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
...@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W"); auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims(); table_dim = table_t->value().dims();
} else { } else {
PADDLE_THROW("table only support LoDTensor and SelectedRows"); PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
} }
bool is_sparse = context.Attr<bool>("is_sparse"); bool is_sparse = context.Attr<bool>("is_sparse");
......
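SelectedRows::index replaces the file-local getIndex removed above; the behavior being replaced was a linear scan from a row id to its position in the rows vector:

#include <algorithm>
#include <cstdint>
#include <vector>

// The linear search that SelectedRows::index now encapsulates (this mirrors
// the removed getIndex helper; the member version can also enforce bounds).
size_t RowIdToIndex(const std::vector<int64_t>& rows, int64_t id) {
  auto it = std::find(rows.begin(), rows.end(), id);
  // The original wrapped this in PADDLE_ENFORCE(it != rows.end(), ...).
  return static_cast<size_t>(std::distance(rows.begin(), it));
}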
...@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>( ...@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>(
cublasOperation_t cuTransB = cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha); float h_alpha = static_cast<float>(alpha);
const half h_beta = static_cast<const half>(beta); float h_beta = static_cast<float>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case // TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53, PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53"); "cublas fp16 gemm requires GPU compute capability >= 53");
  PADDLE_ENFORCE(platform::dynload::cublasHgemm(context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, N)); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_TENSOR_OP_MATH));
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} else {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH));
}
#endif
// cublasHgemm does true FP16 computation which is slow for non-Volta
  // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo));
} }
template <> template <>
......
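The comment block explains the trade: cublasGemmEx keeps fp16 storage but accumulates in fp32, which is both faster on pre-Volta parts and numerically safer than true fp16 arithmetic. The numerical half of that argument in miniature (illustration only; float storage with a double accumulator plays the role of CUDA_R_16F inputs with CUDA_R_32F compute):

#include <cstddef>
#include <vector>

// Storage type vs. compute type: values live in float, the running sum is
// kept in double, so rounding error accumulates in the wider type.
double DotWithWideAccumulator(const std::vector<float>& a,
                              const std::vector<float>& b) {
  double acc = 0.0;  // wider accumulator limits rounding drift
  for (size_t i = 0; i < a.size(); ++i) {
    acc += static_cast<double>(a[i]) * static_cast<double>(b[i]);
  }
  return acc;
}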
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/math/softmax_impl.h"
...@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>; ...@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>; template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>; template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>; template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>; template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>; template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
......
...@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>; ...@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T>
struct ValueClip { struct ValueClip {
HOSTDEVICE T operator()(const T& x) const { HOSTDEVICE T operator()(const T& x) const {
const T kThreshold = -64.; const T kThreshold = static_cast<T>(-64.);
return x < kThreshold ? kThreshold : x; return x < kThreshold ? kThreshold : x;
} }
}; };
......
...@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { ...@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
...@@ -171,6 +171,5 @@ namespace ops = paddle::operators; ...@@ -171,6 +171,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker, REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
paddle::framework::EmptyGradOpMaker); paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>); REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float>, ops::PriorBoxOpKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prior_box_op.h"
namespace paddle {
namespace operators {
template <typename T>
__device__ inline T clip(T in) {
return min(max(in, 0.), 1.);
}
template <typename T>
__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
const int width, const int im_height,
const int im_width, const int as_num,
const T offset, const T step_width,
const T step_height, const T* min_sizes,
const T* max_sizes, const int min_num,
bool is_clip) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
int h = i / (num_priors * width);
int w = (i / num_priors) % width;
int p = i % num_priors;
int m = max_sizes ? p / (as_num + 1) : p / as_num;
T cx = (w + offset) * step_width;
T cy = (h + offset) * step_height;
T bw, bh;
T min_size = min_sizes[m];
if (max_sizes) {
int s = p % (as_num + 1);
if (s < as_num) {
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
} else {
T max_size = max_sizes[m];
bw = sqrt(min_size * max_size) / 2.;
bh = bw;
}
} else {
int s = p % as_num;
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
}
T xmin = (cx - bw) / im_width;
T ymin = (cy - bh) / im_height;
T xmax = (cx + bw) / im_width;
T ymax = (cy + bh) / im_height;
out[i * 4] = is_clip ? clip<T>(xmin) : xmin;
out[i * 4 + 1] = is_clip ? clip<T>(ymin) : ymin;
out[i * 4 + 2] = is_clip ? clip<T>(xmax) : xmax;
out[i * 4 + 3] = is_clip ? clip<T>(ymax) : ymax;
}
}
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
}
template <typename T>
class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* image = ctx.Input<paddle::framework::Tensor>("Image");
auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto flip = ctx.Attr<bool>("flip");
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto im_width = image->dims()[3];
auto im_height = image->dims()[2];
auto width = input->dims()[3];
auto height = input->dims()[2];
T step_width, step_height;
if (step_w == 0 || step_h == 0) {
step_width = static_cast<T>(im_width) / width;
step_height = static_cast<T>(im_height) / height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = aspect_ratios.size() * min_sizes.size();
if (max_sizes.size() > 0) {
num_priors += max_sizes.size();
}
int min_num = static_cast<int>(min_sizes.size());
int box_num = width * height * num_priors;
int block = 512;
int grid = (box_num + block - 1) / block;
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor r;
framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r);
framework::Tensor min;
framework::TensorFromVector(min_sizes, ctx.device_context(), &min);
T* max_data = nullptr;
framework::Tensor max;
if (max_sizes.size() > 0) {
framework::TensorFromVector(max_sizes, ctx.device_context(), &max);
max_data = max.data<T>();
}
GenPriorBox<T><<<grid, block, 0, stream>>>(
boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
max_data, min_num, clip);
framework::Tensor v;
framework::TensorFromVector(variances, ctx.device_context(), &v);
grid = (box_num * 4 + block - 1) / block;
SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
variances.size(), box_num * 4);
}
}; // namespace operators
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
ops::PriorBoxOpCUDAKernel<double>);
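Both GenPriorBox and SetVariance use grid-stride loops, so the launch size computed as grid = (box_num + block - 1) / block is a sizing hint rather than a hard requirement: any grid still covers every element. The pattern reduced to its core, written here as plain C++ over an explicit thread id (illustrative only):

// Grid-stride loop skeleton: thread `tid` of `nthreads` visits indices
// tid, tid + nthreads, tid + 2*nthreads, ... so all n elements are covered
// for any thread count. In the kernels above, tid is
// blockIdx.x * blockDim.x + threadIdx.x and nthreads is
// blockDim.x * gridDim.x.
void GridStrideVisit(int tid, int nthreads, int n, float* out, float value) {
  for (int i = tid; i < n; i += nthreads) {
    out[i] = value;
  }
}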
...@@ -51,7 +51,7 @@ struct ClipFunctor { ...@@ -51,7 +51,7 @@ struct ClipFunctor {
} }
}; };
template <typename Place, typename T> template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> { class PriorBoxOpKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
...@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> { ...@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
int idx = 0; int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) { for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s]; auto min_size = min_sizes[s];
        // priors with different aspect ratios
        for (size_t r = 0; r < aspect_ratios.size(); ++r) {
          float ar = aspect_ratios[r];
          box_width = min_size * sqrt(ar) / 2.;
          box_height = min_size / sqrt(ar) / 2.;
          e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
          e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
          e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
          e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
          idx++;
        }
        if (max_sizes.size() > 0) {
          auto max_size = max_sizes[s];
          // square prior with size sqrt(minSize * maxSize)
          box_width = box_height = sqrt(min_size * max_size) / 2.;
          e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
          e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
          e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
          e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
          idx++;
......
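With the reordered loop above, the per-location prior count matches what the CUDA kernel computes: every min_size pairs with every aspect ratio (including 1, which ExpandAspectRatios adds), plus one square prior per max_size. As a formula:

#include <cstddef>

// Priors per feature-map location, mirroring `num_priors` in the CUDA
// kernel: one box per (min_size, aspect_ratio) pair plus one per max_size.
size_t NumPriorsPerLocation(size_t num_min_sizes, size_t num_aspect_ratios,
                            size_t num_max_sizes) {
  return num_min_sizes * num_aspect_ratios + num_max_sizes;
}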
...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase { ...@@ -39,10 +39,13 @@ class CreateBatchReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
out->Reset( out->Reset(
new BatchReader(underlying_reader.Get(), Attr<int>("batch_size"))); new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
} }
......
...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -99,10 +99,13 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out")) auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>(); ->template GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto place_str = Attr<std::string>("place"); auto place_str = Attr<std::string>("place");
platform::Place place; platform::Place place;
......
...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase { ...@@ -62,12 +62,15 @@ class CreateMultiPassReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
auto& out = detail::Ref(scope.FindVar(Output("Out")));
int pass_num = Attr<int>("pass_num"); int pass_num = Attr<int>("pass_num");
    out.GetMutable<framework::ReaderHolder>()->Reset(new MultiPassReader(underlying_reader.Get(), pass_num)); out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
} }
}; };
......
...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase { ...@@ -80,10 +80,14 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
private: private:
void RunImpl(const framework::Scope& scope, void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
auto* out = detail::Ref(scope.FindVar(Output("Out")))
.GetMutable<framework::ReaderHolder>();
if (out->Get() != nullptr) {
return;
}
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
    auto& var = detail::Ref(scope.FindVar(Output("Out")));
    var.GetMutable<framework::ReaderHolder>()->Reset(new ShuffleReader(underlying_reader.Get(), static_cast<size_t>(Attr<int>("buffer_size")))); out->Reset(new ShuffleReader(underlying_reader.Get(), static_cast<size_t>(Attr<int>("buffer_size"))));
} }
......
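All four reader ops now share the same idempotency guard: if the output ReaderHolder is already populated, re-running the op is a no-op instead of rebuilding the reader chain. The shape of that guard, with a stand-in holder type (sketch only; framework::ReaderHolder holds readers, not ints):

#include <memory>

// Stand-in for framework::ReaderHolder, just to show the guard's shape.
struct Holder {
  const int* Get() const { return ptr_.get(); }
  void Reset(int* p) { ptr_.reset(p); }
  std::unique_ptr<int> ptr_;
};

// Re-running creation is a no-op once the holder is populated, matching the
// `if (out->Get() != nullptr) return;` check added to each reader op.
void CreateReaderOnce(Holder* out) {
  if (out->Get() != nullptr) {
    return;  // a previous run already built the reader
  }
  out->Reset(new int(42));  // placeholder for `new ShuffleReader(...)` etc.
}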
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -34,6 +35,7 @@ namespace m = paddle::operators::math; ...@@ -34,6 +35,7 @@ namespace m = paddle::operators::math;
// global for simplicity. // global for simplicity.
std::unique_ptr<f::OperatorBase> listen_and_serv_op; std::unique_ptr<f::OperatorBase> listen_and_serv_op;
int selected_port;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) { void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place); p::CPUDeviceContext ctx(place);
...@@ -128,14 +130,16 @@ void StartServerNet(bool is_sparse) { ...@@ -128,14 +130,16 @@ void StartServerNet(bool is_sparse) {
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
f::AttributeMap attrs; f::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); attrs.insert({"endpoint", std::string("127.0.0.1:0")});
attrs.insert({"Fanin", 1}); attrs.insert({"Fanin", 1});
attrs.insert({"ParamList", std::vector<std::string>({"Out"})}); attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})}); attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", optimize_block}); attrs.insert({"OptimizeBlock", optimize_block});
listen_and_serv_op = listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs); f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
LOG(INFO) << "selected port before run " << selected_port;
listen_and_serv_op->Run(scope, place); listen_and_serv_op->Run(scope, place);
LOG(INFO) << "server exit";
} }
TEST(SendRecvOp, CPUDense) { TEST(SendRecvOp, CPUDense) {
...@@ -149,12 +153,19 @@ TEST(SendRecvOp, CPUDense) { ...@@ -149,12 +153,19 @@ TEST(SendRecvOp, CPUDense) {
scope.Var("RPC_CLIENT_VAR"); scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs; f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})}); selected_port = static_cast<paddle::operators::ListenAndServOp *>(
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})}); listen_and_serv_op.get())
->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})});
auto send_op = f::OpRegistry::CreateOp( auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}}, "send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
LOG(INFO) << "before run " << endpoint;
send_op->Run(scope, place); send_op->Run(scope, place);
LOG(INFO) << "end run";
auto in_var = scope.Var("x1"); auto in_var = scope.Var("x1");
auto tensor = in_var->GetMutable<f::LoDTensor>(); auto tensor = in_var->GetMutable<f::LoDTensor>();
...@@ -167,6 +178,7 @@ TEST(SendRecvOp, CPUDense) { ...@@ -167,6 +178,7 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) { for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]); EXPECT_EQ(expected[i] * 2, actual[i]);
} }
LOG(INFO) << "before stop";
listen_and_serv_op->Stop(); listen_and_serv_op->Stop();
server_thread.join(); server_thread.join();
listen_and_serv_op.reset(nullptr); listen_and_serv_op.reset(nullptr);
...@@ -182,8 +194,13 @@ TEST(SendRecvOp, CPUSparse) { ...@@ -182,8 +194,13 @@ TEST(SendRecvOp, CPUSparse) {
InitSelectedRowsInScope(scope, place); InitSelectedRowsInScope(scope, place);
scope.Var("RPC_CLIENT_VAR"); scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs; f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})}); selected_port = static_cast<paddle::operators::ListenAndServOp *>(
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})}); listen_and_serv_op.get())
->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})});
auto send_op = f::OpRegistry::CreateOp( auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}}, "send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs); {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
......
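The test still leans on the sleep(5) handshake flagged by the FIXME in RunImpl; a likelier long-term fix is to poll the server for a bound port. A sketch of that idea (not Paddle code; `get_port` would wrap ListenAndServOp::GetSelectedPort, assuming it reads as zero until the port is bound):

#include <chrono>
#include <functional>
#include <thread>

// Sketch: instead of a fixed sleep, spin until the server reports a nonzero
// bound port.
int WaitForSelectedPort(const std::function<int()>& get_port) {
  int port = 0;
  while ((port = get_port()) == 0) {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
  return port;
}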
...@@ -43,9 +43,8 @@ class SGDOp : public framework::OperatorWithKernel { ...@@ -43,9 +43,8 @@ class SGDOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()), ctx.GetPlace()); auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
    return framework::OpKernelType(data_type, ctx.device_context());
} }
}; };
...@@ -53,10 +52,12 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -53,10 +52,12 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker) SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter"); AddInput("Param", "(Tensor or SelectedRows) Input parameter");
AddInput("LearningRate", "(Tensor) Learning rate of SGD"); AddInput("LearningRate", "(Tensor) Learning rate of SGD");
AddInput("Grad", "(Tensor) Input gradient"); AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
AddOutput("ParamOut", "(Tensor) Output parameter"); AddOutput("ParamOut",
"(Tensor or SelectedRows, same with Param) "
"Output parameter, should share the same memory with Param");
AddComment(R"DOC( AddComment(R"DOC(
SGD operator SGD operator
......
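The dense branch of the kernel in the next hunk implements plain SGD, param_out = param - lr * grad, element-wise; the sparse branches apply the same rule only to the rows present in the SelectedRows gradient. The dense rule in isolation (illustration, matching the Eigen expression `o = p - lr[0] * g` below):

#include <cstddef>
#include <vector>

// Element-wise SGD step: out[i] = param[i] - lr * grad[i].
void SgdStep(const std::vector<float>& param, const std::vector<float>& grad,
             float lr, std::vector<float>* out) {
  out->resize(param.size());
  for (size_t i = 0; i < param.size(); ++i) {
    (*out)[i] = param[i] - lr * grad[i];
  }
}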
...@@ -23,60 +23,97 @@ namespace operators { ...@@ -23,60 +23,97 @@ namespace operators {
template <typename T> template <typename T>
class SGDOpKernel : public framework::OpKernel<T> { class SGDOpKernel : public framework::OpKernel<T> {
public: public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");

    const auto *param_var = ctx.InputVar("Param");
    const auto *grad_var = ctx.InputVar("Grad");

    if (param_var->IsType<framework::LoDTensor>()) {
      const auto *param = ctx.Input<framework::Tensor>("Param");
      auto *param_out = ctx.Output<framework::Tensor>("ParamOut");

      // Actually, all tensors are LoDTensor except SelectedRows.
      if (grad_var->IsType<framework::LoDTensor>()) {
        param_out->mutable_data<T>(ctx.GetPlace());
        const auto *grad = ctx.Input<framework::Tensor>("Grad");

        auto p = framework::EigenVector<T>::Flatten(*param);
        auto g = framework::EigenVector<T>::Flatten(*grad);
        auto o = framework::EigenVector<T>::Flatten(*param_out);
        auto *lr = learning_rate->data<T>();

        o = p - lr[0] * g;
      } else if (grad_var->IsType<framework::SelectedRows>()) {
        // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
        // This manual optimization brings difficulty to track data dependency.
        // It's better to find a more elegant solution.
        PADDLE_ENFORCE_EQ(param, param_out);
        const auto *grad = ctx.Input<framework::SelectedRows>("Grad");

        // for distributed training, a sparse var may be empty,
        // just skip updating.
        if (grad->rows().size() == 0) {
          return;
        }

        auto grad_height = grad->height();
        auto out_dims = param_out->dims();
        PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);

        auto &grad_value = grad->value();
        auto &grad_rows = grad->rows();

        size_t grad_row_numel = grad_value.numel() / grad_rows.size();
        PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height);

        auto *grad_data = grad_value.data<T>();
        auto *out_data = param_out->data<T>();
        auto *lr = learning_rate->data<T>();
        for (size_t i = 0; i < grad_rows.size(); i++) {
          PADDLE_ENFORCE(grad_rows[i] < grad_height,
                         "Input rows index should be less than height");
          for (int64_t j = 0; j < grad_row_numel; j++) {
            out_data[grad_rows[i] * grad_row_numel + j] -=
                lr[0] * grad_data[i * grad_row_numel + j];
          }
        }
      } else {
        PADDLE_THROW("Unsupported Variable Type of Grad");
      }
    } else if (param_var->IsType<framework::SelectedRows>()) {
      PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
                     "when param "
                     "is SelectedRows, gradient should also be SelectedRows");
      const auto &param = param_var->Get<framework::SelectedRows>();
      auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
      const auto &grad = grad_var->Get<framework::SelectedRows>();

      // for distributed training, a sparse var may be empty,
      // just skip updating.
      if (grad.rows().size() == 0) {
        return;
      }

      size_t param_row_width = param.value().numel() / param.rows().size();
      size_t grad_row_width = grad.value().numel() / grad.rows().size();
      PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
                        "param_row should have the same size with grad_row");

      const auto *lr = learning_rate->data<T>();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>();
auto* in_data = in_value.data<T>(); for (size_t i = 0; i < grad.rows().size(); i++) {
auto* out_data = param_out->data<T>(); PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
auto* lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height,
"Input rows index should less than height"); "Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) { int64_t id_index = param.index(grad.rows()[i]);
out_data[in_rows[i] * in_row_numel + j] -= for (int64_t j = 0; j < grad_row_width; j++) {
lr[0] * in_data[i * in_row_numel + j]; out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
} }
} }
} else { } else {
PADDLE_THROW("Unsupported Variable Type of Grad"); PADDLE_THROW("Unsupported Variable Type of Parameter");
} }
} }
}; };
......
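The SelectedRows branches above update only the rows named in grad.rows(), leaving every other parameter row untouched. A self-contained sketch of that row-wise update, with plain vectors standing in for the LoDTensor/SelectedRows containers:

// Sketch of the sparse SGD update: only the rows listed in grad_rows are
// touched, each updated in place as param[row] -= lr * grad_row.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

void SparseSgdUpdate(std::vector<float> *param, int64_t row_width,
                     const std::vector<int64_t> &grad_rows,
                     const std::vector<float> &grad_value, float lr) {
  assert(grad_value.size() == grad_rows.size() * row_width);
  const int64_t height = static_cast<int64_t>(param->size()) / row_width;
  for (size_t i = 0; i < grad_rows.size(); ++i) {
    assert(grad_rows[i] < height);  // "rows index should be less than height"
    for (int64_t j = 0; j < row_width; ++j) {
      (*param)[grad_rows[i] * row_width + j] -=
          lr * grad_value[i * row_width + j];
    }
  }
}

int main() {
  std::vector<float> param(4 * 2, 1.0f);  // height 4, row width 2
  SparseSgdUpdate(&param, 2, {1, 3}, {1, 1, 2, 2}, 0.5f);
  std::cout << param[2] << " " << param[6] << "\n";  // 0.5 0
}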
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#endif #endif
...@@ -20,6 +23,7 @@ limitations under the License. */ ...@@ -20,6 +23,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -60,8 +64,8 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -60,8 +64,8 @@ class SoftmaxOp : public framework::OperatorWithKernel {
auto input_data_type = auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type()); framework::ToDataType(ctx.Input<Tensor>("X")->type());
if (input_data_type == framework::proto::VarType::FP16) { if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"float16 can only be used when CUDNN is used"); "float16 can only be used on GPU place");
} }
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -70,6 +74,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -70,6 +74,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
library_); library_);
} }
}; };
class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
...@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators; namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
softmax_grad, softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>); ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(softmax_grad,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>);
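The registration above now instantiates the softmax kernel for both float and plat::float16. A hedged sketch of what dtype-keyed kernel registration looks like in general; every name below is a stand-in, not Paddle's actual REGISTER_OP_CUDA_KERNEL machinery.

// Every name below is hypothetical; Paddle's real registry differs.
#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <typeindex>
#include <typeinfo>
#include <utility>

using KernelFn = std::function<void()>;
using KernelKey = std::pair<std::string, std::type_index>;

std::map<KernelKey, KernelFn> &Registry() {
  static std::map<KernelKey, KernelFn> r;
  return r;
}

template <typename T>
void RegisterKernel(const std::string &op, KernelFn fn) {
  Registry()[{op, std::type_index(typeid(T))}] = std::move(fn);
}

struct float16 { unsigned short x; };  // 2-byte stand-in for plat::float16

int main() {
  RegisterKernel<float>("softmax", [] { std::cout << "fp32 softmax\n"; });
  RegisterKernel<float16>("softmax", [] { std::cout << "fp16 softmax\n"; });
  // Dispatch by (op name, element type), as a kernel registry does.
  Registry()[{"softmax", std::type_index(typeid(float16))}]();
}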
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
...@@ -6,8 +6,8 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ ...@@ -6,8 +6,8 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies(profiler_py_proto profiler_py_proto_init) add_dependencies(profiler_py_proto profiler_py_proto_init)
add_custom_command(TARGET profiler_py_proto POST_BUILD add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/string/printf.h"
#include <ostream> #include <ostream>
#include <sstream> #include <sstream>
...@@ -20,6 +19,7 @@ ...@@ -20,6 +19,7 @@
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
DECLARE_double(fraction_of_cpu_memory_to_use); DECLARE_double(fraction_of_cpu_memory_to_use);
......
...@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor { ...@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
} }
#endif #endif
cudnnDataType_t compute_type =
(type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
desc_, pads.size(), pads.data(), strides.data(), dilations.data(), desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
CUDNN_CROSS_CORRELATION, type)); CUDNN_CROSS_CORRELATION, compute_type));
return desc_; return desc_;
} }
......
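The hunk above makes the convolution descriptor compute in CUDNN_DATA_FLOAT whenever the data type is narrower than double, i.e. half-precision tensors accumulate in fp32 for accuracy. A tiny sketch of that selection, with a local enum standing in for cudnnDataType_t so the example compiles without the cuDNN headers:

// Local stand-in for cudnnDataType_t; illustrative only.
#include <iostream>

enum DataType { DATA_HALF, DATA_FLOAT, DATA_DOUBLE };

// Everything narrower than double (half, float) is accumulated in fp32;
// only double convolutions compute in fp64.
DataType ComputeTypeFor(DataType data_type) {
  return data_type == DATA_DOUBLE ? DATA_DOUBLE : DATA_FLOAT;
}

int main() {
  std::cout << (ComputeTypeFor(DATA_HALF) == DATA_FLOAT) << "\n";     // 1
  std::cout << (ComputeTypeFor(DATA_DOUBLE) == DATA_DOUBLE) << "\n";  // 1
}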
...@@ -24,6 +24,10 @@ void *cublas_dso_handle = nullptr; ...@@ -24,6 +24,10 @@ void *cublas_dso_handle = nullptr;
CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -34,18 +35,18 @@ extern void *cublas_dso_handle; ...@@ -34,18 +35,18 @@ extern void *cublas_dso_handle;
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
inline cublasStatus_t operator()(Args... args) { \ inline cublasStatus_t operator()(Args... args) { \
typedef cublasStatus_t (*cublasFunc)(Args...); \ typedef cublasStatus_t (*cublasFunc)(Args...); \
std::call_once(cublas_dso_flag, \ std::call_once(cublas_dso_flag, []() { \
paddle::platform::dynload::GetCublasDsoHandle, \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
&cublas_dso_handle); \ }); \
void *p_##__name = dlsym(cublas_dso_handle, #__name); \ void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublasFunc>(p_##__name)(args...); \ return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#else #else
#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
...@@ -70,6 +71,7 @@ extern void *cublas_dso_handle; ...@@ -70,6 +71,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \ __macro(cublasHgemm); \
__macro(cublasSgemmEx); \ __macro(cublasSgemmEx); \
__macro(cublasGemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -89,9 +91,15 @@ extern void *cublas_dso_handle; ...@@ -89,9 +91,15 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched) __macro(cublasDgetriBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload } // namespace dynload
......
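This and the following dynload hunks replace the removed three-argument std::call_once form with a lambda that assigns the handle exactly once; symbol lookup then happens per call via dlsym. A self-contained sketch of the pattern, assuming Linux (libm.so.6 and its cos symbol stand in for libcublas; link with -ldl):

// Lazy-load a shared library exactly once, then resolve symbols per call.
#include <dlfcn.h>

#include <iostream>
#include <mutex>  // NOLINT

std::once_flag dso_flag;
void *dso_handle = nullptr;

template <typename... Args>
double CallCos(Args... args) {
  std::call_once(dso_flag, []() {
    // Runs at most once, even with concurrent callers.
    dso_handle = dlopen("libm.so.6", RTLD_LAZY | RTLD_LOCAL);
  });
  using Fn = double (*)(double);
  auto fn = reinterpret_cast<Fn>(dlsym(dso_handle, "cos"));
  return fn(args...);
}

int main() { std::cout << CallCos(0.0) << "\n"; }  // 1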
...@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); ...@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
bool HasCUDNN() { bool HasCUDNN() {
std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle); std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr; return cudnn_dso_handle != nullptr;
} }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <cudnn.h> #include <cudnn.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -30,19 +30,19 @@ extern bool HasCUDNN(); ...@@ -30,19 +30,19 @@ extern bool HasCUDNN();
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
extern void EnforceCUDNNLoaded(const char* fn_name); extern void EnforceCUDNNLoaded(const char* fn_name);
#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...)) (*)(Args...); \ using cudnn_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudnn_dso_flag, \ std::call_once(cudnn_dso_flag, []() { \
paddle::platform::dynload::GetCUDNNDsoHandle, \ cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
&cudnn_dso_handle); \ }); \
EnforceCUDNNLoaded(#__name); \ EnforceCUDNNLoaded(#__name); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \ return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \ } \
}; \ }; \
extern struct DynLoad__##__name __name extern struct DynLoad__##__name __name
#else #else
...@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); __macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
......
...@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
#include <cuda.h> #include <cuda.h>
#include <cupti.h> #include <cupti.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -36,18 +37,18 @@ extern void *cupti_dso_handle; ...@@ -36,18 +37,18 @@ extern void *cupti_dso_handle;
* note: default dynamic linked libs * note: default dynamic linked libs
*/ */
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \ inline CUptiResult CUPTIAPI operator()(Args... args) { \
typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \ typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \
std::call_once(cupti_dso_flag, \ std::call_once(cupti_dso_flag, []() { \
paddle::platform::dynload::GetCUPTIDsoHandle, \ cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
&cupti_dso_handle); \ }); \
void *p_##__name = dlsym(cupti_dso_handle, #__name); \ void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \ return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#else #else
#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name) \
......
...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <curand.h> #include <curand.h>
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle { namespace paddle {
...@@ -25,18 +26,18 @@ namespace dynload { ...@@ -25,18 +26,18 @@ namespace dynload {
extern std::once_flag curand_dso_flag; extern std::once_flag curand_dso_flag;
extern void *curand_dso_handle; extern void *curand_dso_handle;
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
curandStatus_t operator()(Args... args) { \ curandStatus_t operator()(Args... args) { \
typedef curandStatus_t (*curandFunc)(Args...); \ typedef curandStatus_t (*curandFunc)(Args...); \
std::call_once(curand_dso_flag, \ std::call_once(curand_dso_flag, []() { \
paddle::platform::dynload::GetCurandDsoHandle, \ curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
&curand_dso_handle); \ }); \
void *p_##__name = dlsym(curand_dso_handle, #__name); \ void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \ return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#else #else
#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
......
...@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h> #include <dlfcn.h>
#include <memory> #include <memory>
#include <mutex> #include <mutex> // NOLINT
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
...@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1, ...@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1,
return ret; return ret;
} }
static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
void** dso_handle, int dynload_flags) {
int dynload_flags) {
VLOG(3) << "Try to find library: " << dso_path VLOG(3) << "Try to find library: " << dso_path
<< " from default system path."; << " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
// bring System Integrity Projection (SIP), if dso_handle // bring System Integrity Projection (SIP), if dso_handle
// bring System Integrity Protection (SIP), if dso_handle // bring System Integrity Protection (SIP), if dso_handle
// is null, search from default package path in Mac OS. // is null, search from default package path in Mac OS.
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path); dso_handle =
*dso_handle = dlopen(dso_path.c_str(), dynload_flags); dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
if (dso_path == "libcudnn.dylib") { if (dso_path == "libcudnn.dylib") {
LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf " "For instance, sudo tar -xzf "
...@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, ...@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
} }
} }
#endif #endif
return dso_handle;
} }
static inline void GetDsoHandleFromSearchPath(const std::string& search_root, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
const std::string& dso_name, const std::string& dso_name,
void** dso_handle, bool throw_on_error = true) {
bool throw_on_error = true) {
int dynload_flags = RTLD_LAZY | RTLD_LOCAL; int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
*dso_handle = nullptr; void* dso_handle = nullptr;
std::string dlPath = dso_name; std::string dlPath = dso_name;
if (search_root.empty()) { if (search_root.empty()) {
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
} else { } else {
// search xxx.so from custom path // search xxx.so from custom path
dlPath = join(search_root, dso_name); dlPath = join(search_root, dso_name);
*dso_handle = dlopen(dlPath.c_str(), dynload_flags); dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// if not found, search from default path // if not found, search from default path
if (nullptr == *dso_handle) { if (nullptr == dso_handle) {
LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")"; << dlerror() << ")";
dlPath = dso_name; dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
} }
} }
auto error_msg = auto error_msg =
...@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
"using the DYLD_LIBRARY_PATH is impossible unless System " "using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled."; "Integrity Protection (SIP) is disabled.";
if (throw_on_error) { if (throw_on_error) {
PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror());
} else if (nullptr == *dso_handle) { } else if (nullptr == dso_handle) {
LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
} }
return dso_handle;
} }
void GetCublasDsoHandle(void** dso_handle) { void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif #endif
} }
void GetCUDNNDsoHandle(void** dso_handle) { void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
false);
#else #else
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif #endif
} }
void GetCUPTIDsoHandle(void** dso_handle) { void* GetCUPTIDsoHandle() {
std::string cupti_path = cupti_lib_path; std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) { if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir; cupti_path = FLAGS_cupti_dir;
} }
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false); return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
#else #else
GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false); return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
#endif #endif
} }
void GetCurandDsoHandle(void** dso_handle) { void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif #endif
} }
void GetWarpCTCDsoHandle(void** dso_handle) { void* GetWarpCTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
#endif #endif
} }
void GetLapackDsoHandle(void** dso_handle) { void* GetLapackDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
#endif #endif
} }
void GetNCCLDsoHandle(void** dso_handle) { void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
#else #else
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle); return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
#endif #endif
} }
......
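Throughout this file the loader functions switch from writing through a void** out-parameter to returning the handle, which also lets the dso_path argument become const. A minimal before/after sketch of the two shapes, again using libm as a stand-in library:

// Contrast of the old out-parameter style and the new return-value style.
#include <dlfcn.h>

#include <iostream>
#include <string>

// Old shape: caller passes void**, callee writes through it.
void GetHandleOld(const std::string &name, void **handle) {
  *handle = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
}

// New shape: the handle is the return value; inputs stay const.
void *GetHandleNew(const std::string &name) {
  return dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
}

int main() {
  void *h1 = nullptr;
  GetHandleOld("libm.so.6", &h1);
  void *h2 = GetHandleNew("libm.so.6");
  std::cout << (h1 != nullptr) << (h2 != nullptr) << "\n";  // 11 on Linux
}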
...@@ -18,55 +18,13 @@ namespace paddle { ...@@ -18,55 +18,13 @@ namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
/** void* GetCublasDsoHandle();
* @brief load the DSO of CUBLAS void* GetCUDNNDsoHandle();
* void* GetCUPTIDsoHandle();
* @param **dso_handle dso handler void* GetCurandDsoHandle();
* void* GetWarpCTCDsoHandle();
*/ void* GetLapackDsoHandle();
void GetCublasDsoHandle(void** dso_handle); void* GetNCCLDsoHandle();
/**
* @brief load the DSO of CUDNN
*
* @param **dso_handle dso handler
*
*/
void GetCUDNNDsoHandle(void** dso_handle);
void GetCUPTIDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CURAND
*
* @param **dso_handle dso handler
*
*/
void GetCurandDsoHandle(void** dso_handle);
/**
* @brief load the DSO of warp-ctc
*
* @param **dso_handle dso handler
*
*/
void GetWarpCTCDsoHandle(void** dso_handle);
/**
* @brief load the DSO of lapack
*
* @param **dso_handle dso handler
*
*/
void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of NVIDIA nccl
*
* @param **dso_handle dso handler
*
*/
void GetNCCLDsoHandle(void** dso_handle);
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
......
...@@ -25,11 +25,6 @@ void *nccl_dso_handle; ...@@ -25,11 +25,6 @@ void *nccl_dso_handle;
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
[] { GetNCCLDsoHandle(&nccl_dso_handle); });
}
} // namespace dynload } // namespace dynload
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <dlfcn.h> #include <dlfcn.h>
#include <nccl.h> #include <nccl.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h" #include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...@@ -28,18 +29,19 @@ extern std::once_flag nccl_dso_flag; ...@@ -28,18 +29,19 @@ extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle; extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
extern void LoadNCCLDSO();
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \ using nccl_func = decltype(__name(args...)) (*)(Args...); \
paddle::platform::dynload::LoadNCCLDSO(); \ std::call_once(nccl_dso_flag, []() { \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \ nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \ }); \
} \ void* p_##__name = dlsym(nccl_dso_handle, #__name); \
}; \ return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#else #else
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
......
...@@ -15,9 +15,10 @@ limitations under the License. */ ...@@ -15,9 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <dlfcn.h> #include <dlfcn.h>
#include <mutex> #include <mutex> // NOLINT
#include "ctc.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "warpctc/include/ctc.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -31,18 +32,18 @@ extern void* warpctc_dso_handle; ...@@ -31,18 +32,18 @@ extern void* warpctc_dso_handle;
* (for each function) to dynamic load warpctc routine * (for each function) to dynamic load warpctc routine
* via operator overloading. * via operator overloading.
*/ */
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \ struct DynLoad__##__name { \
template <typename... Args> \ template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \ auto operator()(Args... args) -> decltype(__name(args...)) { \
using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ using warpctcFunc = decltype(__name(args...)) (*)(Args...); \
std::call_once(warpctc_dso_flag, \ std::call_once(warpctc_dso_flag, []() { \
paddle::platform::dynload::GetWarpCTCDsoHandle, \ warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
&warpctc_dso_handle); \ }); \
void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ void* p_##_name = dlsym(warpctc_dso_handle, #__name); \
return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \ return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \
} \ } \
}; \ }; \
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
......
...@@ -16,35 +16,35 @@ limitations under the License. */ ...@@ -16,35 +16,35 @@ limitations under the License. */
#include <dlfcn.h> // for dladdr #include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace #include <execinfo.h> // for backtrace
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif // PADDLE_WITH_CUDA
#include <iomanip> #include <iomanip>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif
#include <glog/logging.h>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/dynload/curand.h"
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif #endif
namespace paddle { namespace paddle {
...@@ -185,7 +185,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error( ...@@ -185,7 +185,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
} }
} }
#endif // PADDLE_ONLY_CPU #endif // PADDLE_WITH_CUDA
template <typename T> template <typename T>
inline void throw_on_error(T e) { inline void throw_on_error(T e) {
......
...@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) { ...@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) { ...@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GE(1, 2UL); PADDLE_ENFORCE_GE(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) { ...@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE( EXPECT_TRUE(
...@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) { ...@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) {
try { try {
int* a = nullptr; int* a = nullptr;
PADDLE_ENFORCE_NOT_NULL(a); PADDLE_ENFORCE_NOT_NULL(a);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null")); EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <stdint.h>
#include <limits>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
...@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 {
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const { HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(float(*this)); return static_cast<int8_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint8_t() const { HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(float(*this)); return static_cast<uint8_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int16_t() const { HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(float(*this)); return static_cast<int16_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint16_t() const { HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(float(*this)); return static_cast<uint16_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int32_t() const { HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(float(*this)); return static_cast<int32_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint32_t() const { HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(float(*this)); return static_cast<uint32_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator int64_t() const { HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(float(*this)); return static_cast<int64_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator uint64_t() const { HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(float(*this)); return static_cast<uint64_t>(static_cast<float>(*this));
} }
HOSTDEVICE inline explicit operator double() const { HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(float(*this)); return static_cast<double>(static_cast<float>(*this));
} }
private: private:
...@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) { ...@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hadd(a, b); return __hadd(a, b);
#else #else
float res = float(float16(a)) + float(float16(b)); float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) { ...@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hsub(a, b); return __hsub(a, b);
#else #else
float res = float(float16(a)) - float(float16(b)); float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) { ...@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hmul(a, b); return __hmul(a, b);
#else #else
float res = float(float16(a)) * float(float16(b)); float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) { ...@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
float denom = __half2float(b); float denom = __half2float(b);
return __float2half(num / denom); return __float2half(num / denom);
#else #else
float res = float(float16(a)) / float(float16(b)); float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
...@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) { ...@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hneg(a); return __hneg(a);
#else #else
float res = -float(float16(a)); float res = -static_cast<float>(float16(a));
return half(float16(res)); return half(float16(res));
#endif #endif
} }
DEVICE inline half& operator+=(half& a, const half& b) { DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
} }
DEVICE inline half& operator-=(half& a, const half& b) { DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT
a = a - b; a = a - b;
return a; return a;
} }
DEVICE inline half& operator*=(half& a, const half& b) { DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT
a = a * b; a = a * b;
return a; return a;
} }
DEVICE inline half& operator/=(half& a, const half& b) { DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
...@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) { ...@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(a, b); return __heq(a, b);
#else #else
return float(float16(a)) == float(float16(b)); return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
#endif #endif
} }
...@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) { ...@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(a, b); return __hne(a, b);
#else #else
return float(float16(a)) != float(float16(b)); return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
#endif #endif
} }
...@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) { ...@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(a, b); return __hlt(a, b);
#else #else
return float(float16(a)) < float(float16(b)); return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
#endif #endif
} }
...@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) { ...@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(a, b); return __hle(a, b);
#else #else
return float(float16(a)) <= float(float16(b)); return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
#endif #endif
} }
...@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) { ...@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(a, b); return __hgt(a, b);
#else #else
return float(float16(a)) > float(float16(b)); return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
#endif #endif
} }
...@@ -477,7 +478,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { ...@@ -477,7 +478,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(a, b); return __hge(a, b);
#else #else
return float(float16(a)) >= float(float16(b)); return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
#endif #endif
} }
...@@ -489,7 +490,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { ...@@ -489,7 +490,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hadd(half(a), half(b))); return float16(__hadd(half(a), half(b)));
#else #else
return float16(float(a) + float(b)); return float16(static_cast<float>(a) + static_cast<float>(b));
#endif #endif
} }
...@@ -497,7 +498,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { ...@@ -497,7 +498,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hsub(half(a), half(b))); return float16(__hsub(half(a), half(b)));
#else #else
return float16(float(a) - float(b)); return float16(static_cast<float>(a) - static_cast<float>(b));
#endif #endif
} }
...@@ -505,7 +506,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { ...@@ -505,7 +506,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hmul(half(a), half(b))); return float16(__hmul(half(a), half(b)));
#else #else
return float16(float(a) * float(b)); return float16(static_cast<float>(a) * static_cast<float>(b));
#endif #endif
} }
...@@ -516,7 +517,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { ...@@ -516,7 +517,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
float denom = __half2float(half(b)); float denom = __half2float(half(b));
return float16(num / denom); return float16(num / denom);
#else #else
return float16(float(a) / float(b)); return float16(static_cast<float>(a) / static_cast<float>(b));
#endif #endif
} }
...@@ -530,22 +531,22 @@ HOSTDEVICE inline float16 operator-(const float16& a) { ...@@ -530,22 +531,22 @@ HOSTDEVICE inline float16 operator-(const float16& a) {
#endif #endif
} }
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b; a = a + b;
return a; return a;
} }
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = a - b; a = a - b;
return a; return a;
} }
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = a * b; a = a * b;
return a; return a;
} }
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = a / b; a = a / b;
return a; return a;
} }
...@@ -554,7 +555,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { ...@@ -554,7 +555,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(half(a), half(b)); return __heq(half(a), half(b));
#else #else
return float(a) == float(b); return static_cast<float>(a) == static_cast<float>(b);
#endif #endif
} }
...@@ -562,7 +563,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { ...@@ -562,7 +563,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(half(a), half(b)); return __hne(half(a), half(b));
#else #else
return float(a) != float(b); return static_cast<float>(a) != static_cast<float>(b);
#endif #endif
} }
...@@ -570,7 +571,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { ...@@ -570,7 +571,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(half(a), half(b)); return __hlt(half(a), half(b));
#else #else
return float(a) < float(b); return static_cast<float>(a) < static_cast<float>(b);
#endif #endif
} }
...@@ -578,7 +579,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { ...@@ -578,7 +579,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(half(a), half(b)); return __hle(half(a), half(b));
#else #else
-  return float(a) <= float(b);
+  return static_cast<float>(a) <= static_cast<float>(b);
 #endif
 }
@@ -586,7 +587,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(half(a), half(b));
 #else
-  return float(a) > float(b);
+  return static_cast<float>(a) > static_cast<float>(b);
 #endif
 }
@@ -594,7 +595,7 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(half(a), half(b));
 #else
-  return float(a) >= float(b);
+  return static_cast<float>(a) >= static_cast<float>(b);
 #endif
 }
@@ -679,22 +680,22 @@ inline float16 operator-(const float16& a) {
   return res;
 }
-inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }
-inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }
-inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }
-inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
   a = a / b;
   return a;
 }
@@ -784,19 +785,19 @@ inline bool operator>=(const float16& a, const float16& b) {
 // Arithmetic operators for float16, software emulated on other CPU
 #else
 inline float16 operator+(const float16& a, const float16& b) {
-  return float16(float(a) + float(b));
+  return float16(static_cast<float>(a) + static_cast<float>(b));
 }
 inline float16 operator-(const float16& a, const float16& b) {
-  return float16(float(a) - float(b));
+  return float16(static_cast<float>(a) - static_cast<float>(b));
 }
 inline float16 operator*(const float16& a, const float16& b) {
-  return float16(float(a) * float(b));
+  return float16(static_cast<float>(a) * static_cast<float>(b));
 }
 inline float16 operator/(const float16& a, const float16& b) {
-  return float16(float(a) / float(b));
+  return float16(static_cast<float>(a) / static_cast<float>(b));
 }
 inline float16 operator-(const float16& a) {
@@ -805,51 +806,57 @@ inline float16 operator-(const float16& a) {
   return res;
 }
-inline float16& operator+=(float16& a, const float16& b) {
-  a = float16(float(a) + float(b));
+inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) + static_cast<float>(b));
   return a;
 }
-inline float16& operator-=(float16& a, const float16& b) {
-  a = float16(float(a) - float(b));
+inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) - static_cast<float>(b));
   return a;
 }
-inline float16& operator*=(float16& a, const float16& b) {
-  a = float16(float(a) * float(b));
+inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) * static_cast<float>(b));
   return a;
 }
-inline float16& operator/=(float16& a, const float16& b) {
-  a = float16(float(a) / float(b));
+inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) / static_cast<float>(b));
   return a;
 }
 inline bool operator==(const float16& a, const float16& b) {
-  return float(a) == float(b);
+  return static_cast<float>(a) == static_cast<float>(b);
 }
 inline bool operator!=(const float16& a, const float16& b) {
-  return float(a) != float(b);
+  return static_cast<float>(a) != static_cast<float>(b);
 }
 inline bool operator<(const float16& a, const float16& b) {
-  return float(a) < float(b);
+  return static_cast<float>(a) < static_cast<float>(b);
 }
 inline bool operator<=(const float16& a, const float16& b) {
-  return float(a) <= float(b);
+  return static_cast<float>(a) <= static_cast<float>(b);
 }
 inline bool operator>(const float16& a, const float16& b) {
-  return float(a) > float(b);
+  return static_cast<float>(a) > static_cast<float>(b);
 }
 inline bool operator>=(const float16& a, const float16& b) {
-  return float(a) >= float(b);
+  return static_cast<float>(a) >= static_cast<float>(b);
 }
 #endif
+HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
+  float16 res;
+  res.x = a;
+  return res;
+}
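The raw_uint16_to_float16 helper added above builds a float16 straight from its bit pattern; it is what the limits and traits specializations below use to spell special values. A minimal sketch, assuming the usual header path for this file and the IEEE 754 binary16 encodings (0x7c00 is +infinity, 0x7e00 a quiet NaN):

#include "paddle/fluid/platform/float16.h"  // assumed path of the header being diffed

void Float16SpecialValuesSketch() {
  using paddle::platform::float16;
  using paddle::platform::raw_uint16_to_float16;
  // IEEE 754 binary16 bit patterns; they match the constants used by the
  // numeric_limits and Eigen::NumTraits specializations below.
  float16 pos_inf = raw_uint16_to_float16(0x7c00);
  float16 quiet_nan = raw_uint16_to_float16(0x7e00);
  bool inf_ok = (paddle::platform::isinf)(pos_inf);    // expected: true
  bool nan_ok = (paddle::platform::isnan)(quiet_nan);  // expected: true
  (void)inf_ok;
  (void)nan_ok;
}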
 HOSTDEVICE inline bool(isnan)(const float16& a) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hisnan(half(a));
@@ -886,28 +893,116 @@ struct is_pod<paddle::platform::float16> {
       is_standard_layout<paddle::platform::float16>::value;
 };
+template <>
+struct numeric_limits<paddle::platform::float16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;
+  static const int max_digits10 = 5;
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+  static paddle::platform::float16(min)() {
+    return paddle::platform::raw_uint16_to_float16(0x400);
+  }
+  static paddle::platform::float16 lowest() {
+    return paddle::platform::raw_uint16_to_float16(0xfbff);
+  }
+  static paddle::platform::float16(max)() {
+    return paddle::platform::raw_uint16_to_float16(0x7bff);
+  }
+  static paddle::platform::float16 epsilon() {
+    return paddle::platform::raw_uint16_to_float16(0x0800);
+  }
+  static paddle::platform::float16 round_error() {
+    return paddle::platform::float16(0.5);
+  }
+  static paddle::platform::float16 infinity() {
+    return paddle::platform::raw_uint16_to_float16(0x7c00);
+  }
+  static paddle::platform::float16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7e00);
+  }
+  static paddle::platform::float16 signaling_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7e00);
+  }
+  static paddle::platform::float16 denorm_min() {
+    return paddle::platform::raw_uint16_to_float16(0x1);
+  }
+};
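Decoding the bit patterns above: 0x7bff is the largest finite binary16 value (65504), 0x400 the smallest positive normal (2^-14), 0x1 the smallest subnormal (2^-24), and 0x0800 decodes to 2^-13. A hedged sanity-check sketch under the same assumed header path:

#include <limits>
#include "paddle/fluid/platform/float16.h"  // assumed path

void Float16LimitsSketch() {
  using limits = std::numeric_limits<paddle::platform::float16>;
  // (max)() is 0x7bff, i.e. 2^15 * (1 + 1023/1024) = 65504.
  float max_v = static_cast<float>((limits::max)());
  // epsilon() is 0x0800, which decodes to 2^-13 = 0.0001220703125.
  float eps = static_cast<float>(limits::epsilon());
  (void)max_v;
  (void)eps;
}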
 }  // namespace std
 namespace Eigen {
+using float16 = paddle::platform::float16;
+template <>
+struct NumTraits<float16> : GenericNumTraits<float16> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+  HOSTDEVICE static inline float16 epsilon() {
+    return paddle::platform::raw_uint16_to_float16(0x0800);
+  }
+  HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
+  HOSTDEVICE static inline float16 highest() {
+    return paddle::platform::raw_uint16_to_float16(0x7bff);
+  }
+  HOSTDEVICE static inline float16 lowest() {
+    return paddle::platform::raw_uint16_to_float16(0xfbff);
+  }
+  HOSTDEVICE static inline float16 infinity() {
+    return paddle::platform::raw_uint16_to_float16(0x7c00);
+  }
+  HOSTDEVICE static inline float16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7c01);
+  }
+};
 namespace numext {
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isnan)(const float16& a) {
   return (paddle::platform::isnan)(a);
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isinf)(const float16& a) {
   return (paddle::platform::isinf)(a);
 }
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
   return (paddle::platform::isfinite)(a);
 }
+template <>
+HOSTDEVICE inline float16 exp(const float16& a) {
+  return float16(::expf(static_cast<float>(a)));
+}
 }  // namespace numext
 }  // namespace Eigen
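The numext::exp specialization above widens to float, calls the C library expf, and narrows back with the float16 constructor. An equivalent free-standing sketch (header path again assumed):

#include <cmath>
#include "paddle/fluid/platform/float16.h"  // assumed path

// Same round-trip as the Eigen::numext::exp<float16> specialization above.
paddle::platform::float16 Float16Exp(const paddle::platform::float16& a) {
  return paddle::platform::float16(::expf(static_cast<float>(a)));
}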
@@ -14,8 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
-#include "gflags/gflags.h"
+#include <algorithm>
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
@@ -77,8 +78,8 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
-void GpuMemoryUsage(size_t &available, size_t &total) {
-  PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
+void GpuMemoryUsage(size_t *available, size_t *total) {
+  PADDLE_ENFORCE(cudaMemGetInfo(available, total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -86,7 +87,7 @@ size_t GpuMaxAllocSize() {
   size_t total = 0;
   size_t available = 0;
-  GpuMemoryUsage(available, total);
+  GpuMemoryUsage(&available, &total);
   // Reserve the rest for page tables, etc.
   return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
@@ -101,7 +102,7 @@ size_t GpuMaxChunkSize() {
   size_t total = 0;
   size_t available = 0;
-  GpuMemoryUsage(available, total);
+  GpuMemoryUsage(&available, &total);
   VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
            << total / 1024 / 1024 << "M";
   size_t reserving = static_cast<size_t>(0.05 * total);
...
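GpuMemoryUsage now takes its two out-parameters by pointer rather than by non-const reference, so the mutation is visible at each call site; GpuMaxAllocSize and GpuMaxChunkSize above pass &available, &total accordingly. A minimal caller sketch (CUDA build assumed):

#include "paddle/fluid/platform/gpu_info.h"

size_t FreeGpuBytesSketch() {
  size_t available = 0;
  size_t total = 0;
  paddle::platform::GpuMemoryUsage(&available, &total);  // pointers, not refs
  return available;
}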
@@ -23,10 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-//! Environment variable: fraction of GPU memory to use on each device.
-const std::string kEnvFractionGpuMemoryToUse =
-    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
@@ -46,7 +42,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t &available, size_t &total);
+void GpuMemoryUsage(size_t *available, size_t *total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
...
@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <iostream>
+#include <vector>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/variant.h"
...
+---
+Language: Cpp
+BasedOnStyle: Google
+Standard: Cpp11
+...
@@ -15,4 +15,6 @@ if(WITH_PYTHON)
     target_link_libraries(paddle_pybind rt)
   endif(NOT APPLE AND NOT ANDROID)
   endif(WITH_AMD_GPU)
+
+  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
 endif(WITH_PYTHON)
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "const_value.h"
+#include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/framework/operator.h"
 namespace paddle {
 namespace pybind {
-void BindConstValue(pybind11::module& m) {
-  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
-  m.def("kTempVarName", [] { return framework::kTempVarName; });
-  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
-  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+void BindConstValue(pybind11::module* m) {
+  m->def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m->def("kTempVarName", [] { return framework::kTempVarName; });
+  m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
 }
 }  // namespace pybind
...
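Every Bind* entry point in this commit moves from pybind11::module& to pybind11::module*: call sites pass &m and the bodies use m->def. A sketch of the convention with a hypothetical binder (BindAnswer and the demo module are illustrative only, not part of the commit):

#include <Python.h>
#include "pybind11/pybind11.h"

// Hypothetical binder following the pointer convention adopted here.
void BindAnswer(pybind11::module* m) {
  m->def("answer", [] { return 42; });
}

PYBIND11_PLUGIN(demo) {  // same module-init macro pybind.cc uses below
  pybind11::module m("demo", "sketch");
  BindAnswer(&m);  // pass the module by address
  return m.ptr();
}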
@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <Python.h>
 #include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
-namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-extern void BindConstValue(pybind11::module& m);
+void BindConstValue(pybind11::module* m);
 }  // namespace pybind
 }  // namespace paddle
@@ -17,8 +17,8 @@ limitations under the License. */
 namespace paddle {
 namespace pybind {
-void BindException(pybind11::module& m) {
-  static pybind11::exception<platform::EnforceNotMet> exc(m, "EnforceNotMet");
+void BindException(pybind11::module* m) {
+  static pybind11::exception<platform::EnforceNotMet> exc(*m, "EnforceNotMet");
   pybind11::register_exception_translator([](std::exception_ptr p) {
     try {
       if (p) std::rethrow_exception(p);
@@ -27,7 +27,8 @@ void BindException(pybind11::module& m) {
     }
   });
-  m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); });
+  m->def("__unittest_throw_exception__",
+         [] { PADDLE_THROW("test exception"); });
 }
 }  // namespace pybind
...
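The translator registered in BindException converts a C++ platform::EnforceNotMet into the Python-visible EnforceNotMet type; __unittest_throw_exception__ exists to exercise exactly that path. A sketch of the C++ side of that flow, assuming only enforce.h:

#include "paddle/fluid/platform/enforce.h"

void ExceptionFlowSketch() {
  try {
    PADDLE_THROW("test exception");  // raises platform::EnforceNotMet
  } catch (const paddle::platform::EnforceNotMet& err) {
    // In-process, the translator above would call exc(err.what()) at this
    // point, surfacing the error to Python as EnforceNotMet.
    (void)err;
  }
}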
@@ -11,14 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <Python.h>
 #include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
 namespace paddle {
 namespace pybind {
-extern void BindException(pybind11::module& m);
+void BindException(pybind11::module* m);
 }  // namespace pybind
 }  // namespace paddle
@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/pybind/protobuf.h"
 #include <deque>
 #include <iostream>
+#include <string>
+#include <tuple>
 #include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
@@ -95,10 +98,11 @@ struct type_caster<boost::variant<Args...>>
 namespace paddle {
 namespace pybind {
-using namespace paddle::framework;  // NOLINT
+namespace pd = paddle::framework;
 template <typename T>
-static py::bytes SerializeMessage(T &self) {
+static pybind11::bytes SerializeMessage(
+    T &self) {  // NOLINT due to pybind11 convention.
   // Check IsInitialized in Python
   std::string retv;
   PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
@@ -107,24 +111,24 @@ static py::bytes SerializeMessage(T &self) {
 }
 // Bind Methods
-void BindProgramDesc(py::module &m) {
-  py::class_<ProgramDesc>(m, "ProgramDesc", "")
-      .def(py::init<>())
+void BindProgramDesc(pybind11::module *m) {
+  pybind11::class_<pd::ProgramDesc>(*m, "ProgramDesc", "")
+      .def(pybind11::init<>())
       .def("__init__",
-           [](ProgramDesc &self, const ProgramDesc &other) {
-             new (&self) ProgramDesc(other);
+           [](pd::ProgramDesc &self, const pd::ProgramDesc &other) {
+             new (&self) pd::ProgramDesc(other);
            })
       .def("__init__",
-           [](ProgramDesc &self, const py::bytes &binary_str) {
+           [](pd::ProgramDesc &self, const pybind11::bytes &binary_str) {
             std::string str(binary_str);
-             new (&self) ProgramDesc(str);
+             new (&self) pd::ProgramDesc(str);
           })
-      .def("append_block", &ProgramDesc::AppendBlock,
-           py::return_value_policy::reference)
+      .def("append_block", &pd::ProgramDesc::AppendBlock,
+           pybind11::return_value_policy::reference)
       .def("append_backward",
-           [](ProgramDesc &program_desc, const VarDesc &target,
+           [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
               const std::unordered_set<std::string> &no_grad_vars) {
-             ParamGradInfoMap param_grad_map =
+             pd::ParamGradInfoMap param_grad_map =
                  AppendBackward(program_desc, target, no_grad_vars);
             std::unordered_map<
                 std::string, std::tuple<std::string /* grad_var_name */,
@@ -138,172 +142,184 @@ void BindProgramDesc(py::module &m) {
             }
             return retv;
           })
-      .def("block", &ProgramDesc::MutableBlock,
-           py::return_value_policy::reference)
-      .def("num_blocks", &ProgramDesc::Size)
-      .def("serialize_to_string", SerializeMessage<ProgramDesc>)
+      .def("block", &pd::ProgramDesc::MutableBlock,
+           pybind11::return_value_policy::reference)
+      .def("num_blocks", &pd::ProgramDesc::Size)
+      .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
       .def("parse_from_string",
-           [](ProgramDesc &program_desc, const std::string &data) {
-             proto::ProgramDesc *desc = program_desc.Proto();
+           [](pd::ProgramDesc &program_desc, const std::string &data) {
+             pd::proto::ProgramDesc *desc = program_desc.Proto();
             PADDLE_ENFORCE(desc->ParseFromString(data),
                            "Fail to parse ProgramDesc from string. This could "
                            "be a bug of Paddle.");
           });
 }
-void BindBlockDesc(py::module &m) {
-  py::class_<BlockDesc>(m, "BlockDesc", "")
-      .def_property_readonly("id", &BlockDesc::ID)
-      .def_property_readonly("parent", &BlockDesc::Parent)
-      .def("get_forward_block_idx", &BlockDesc::ForwardBlockID)
-      .def("set_forward_block_idx", &BlockDesc::SetForwardBlockID)
-      .def("append_op", &BlockDesc::AppendOp,
-           py::return_value_policy::reference)
-      .def("prepend_op", &BlockDesc::PrependOp,
-           py::return_value_policy::reference)
-      .def("insert_op", &BlockDesc::InsertOp,
-           py::return_value_policy::reference)
-      .def("remove_op", &BlockDesc::RemoveOp)
+void BindBlockDesc(pybind11::module *m) {
+  pybind11::class_<pd::BlockDesc>(*m, "BlockDesc", "")
+      .def_property_readonly("id", &pd::BlockDesc::ID)
+      .def_property_readonly("parent", &pd::BlockDesc::Parent)
+      .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
+      .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
+      .def("append_op", &pd::BlockDesc::AppendOp,
+           pybind11::return_value_policy::reference)
+      .def("prepend_op", &pd::BlockDesc::PrependOp,
+           pybind11::return_value_policy::reference)
+      .def("insert_op", &pd::BlockDesc::InsertOp,
+           pybind11::return_value_policy::reference)
+      .def("remove_op", &pd::BlockDesc::RemoveOp)
       .def("var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
             std::string name = byte_name;
             return self.Var(name);
           },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("has_var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
             std::string name = byte_name;
            return self.HasVar(name);
           },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("rename_var",
-           [](BlockDesc &self, const py::bytes &byte_name,
-              const py::bytes &byte_name_new) {
+           [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
+              const pybind11::bytes &byte_name_new) {
            std::string name = byte_name;
            std::string new_name = byte_name_new;
            self.RenameVar(name, new_name);
           })
       .def("has_var_recursive",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
            std::string name = byte_name;
            return self.HasVarRecursive(name);
           })
       .def("find_var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
            std::string name = byte_name;
            return self.FindVar(name);
           },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("find_var_recursive",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
            std::string name = byte_name;
            return self.FindVarRecursive(name);
           },
-           py::return_value_policy::reference)
-      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
-      .def("op_size", &BlockDesc::OpSize)
-      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", SerializeMessage<BlockDesc>);
+           pybind11::return_value_policy::reference)
+      .def("remove_var",
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+             std::string name = byte_name;
+             return self.RemoveVar(name);
+           },
+           pybind11::return_value_policy::reference)
+      .def("all_vars", &pd::BlockDesc::AllVars,
+           pybind11::return_value_policy::reference)
+      .def("op_size", &pd::BlockDesc::OpSize)
+      .def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference)
+      .def("serialize_to_string", SerializeMessage<pd::BlockDesc>);
 }
-void BindVarDsec(py::module &m) {
-  py::class_<VarDesc> var_desc(m, "VarDesc", "");
+void BindVarDsec(pybind11::module *m) {
+  pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
   var_desc
       .def("name",
-           [](VarDesc &self) {
-             py::bytes name = self.Name();
+           [](pd::VarDesc &self) {
+             pybind11::bytes name = self.Name();
              return name;
            },
-           py::return_value_policy::reference)
-      .def("set_name", &VarDesc::SetName)
-      .def("set_shape", &VarDesc::SetShape)
-      .def("set_shapes", &VarDesc::SetShapes)
-      .def("set_dtype", &VarDesc::SetDataType)
-      .def("set_dtypes", &VarDesc::SetDataTypes)
-      .def("set_capacity", &VarDesc::SetCapacity)
-      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
-      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
-      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
-      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
-      .def("lod_level", &VarDesc::GetLoDLevel)
-      .def("lod_levels", &VarDesc::GetLoDLevels,
-           py::return_value_policy::reference)
-      .def("set_lod_level", &VarDesc::SetLoDLevel)
-      .def("set_lod_levels", &VarDesc::SetLoDLevels)
-      .def("type", &VarDesc::GetType)
-      .def("set_type", &VarDesc::SetType)
-      .def("serialize_to_string", SerializeMessage<VarDesc>)
-      .def("persistable", &VarDesc::Persistable)
-      .def("set_persistable", &VarDesc::SetPersistable);
+           pybind11::return_value_policy::reference)
+      .def("set_name", &pd::VarDesc::SetName)
+      .def("set_shape", &pd::VarDesc::SetShape)
+      .def("set_shapes", &pd::VarDesc::SetShapes)
+      .def("set_dtype", &pd::VarDesc::SetDataType)
+      .def("set_dtypes", &pd::VarDesc::SetDataTypes)
+      .def("set_capacity", &pd::VarDesc::SetCapacity)
+      .def("shape", &pd::VarDesc::GetShape,
+           pybind11::return_value_policy::reference)
+      .def("shapes", &pd::VarDesc::GetShapes,
+           pybind11::return_value_policy::reference)
+      .def("dtype", &pd::VarDesc::GetDataType,
+           pybind11::return_value_policy::reference)
+      .def("dtypes", &pd::VarDesc::GetDataTypes,
+           pybind11::return_value_policy::reference)
+      .def("lod_level", &pd::VarDesc::GetLoDLevel)
+      .def("lod_levels", &pd::VarDesc::GetLoDLevels,
+           pybind11::return_value_policy::reference)
+      .def("set_lod_level", &pd::VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &pd::VarDesc::SetLoDLevels)
+      .def("type", &pd::VarDesc::GetType)
+      .def("set_type", &pd::VarDesc::SetType)
+      .def("serialize_to_string", SerializeMessage<pd::VarDesc>)
+      .def("persistable", &pd::VarDesc::Persistable)
+      .def("set_persistable", &pd::VarDesc::SetPersistable);
-  py::enum_<proto::VarType::Type>(var_desc, "VarType", "")
-      .value("BOOL", proto::VarType::BOOL)
-      .value("INT16", proto::VarType::INT16)
-      .value("INT32", proto::VarType::INT32)
-      .value("INT64", proto::VarType::INT64)
-      .value("FP16", proto::VarType::FP16)
-      .value("FP32", proto::VarType::FP32)
-      .value("FP64", proto::VarType::FP64)
-      .value("LOD_TENSOR", proto::VarType::LOD_TENSOR)
-      .value("SELECTED_ROWS", proto::VarType::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", proto::VarType::FEED_MINIBATCH)
-      .value("FETCH_LIST", proto::VarType::FETCH_LIST)
-      .value("STEP_SCOPES", proto::VarType::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", proto::VarType::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", proto::VarType::LOD_TENSOR_ARRAY)
-      .value("CHANNEL", proto::VarType::CHANNEL)
-      .value("PLACE_LIST", proto::VarType::PLACE_LIST)
-      .value("READER", proto::VarType::READER)
-      .value("RAW", proto::VarType::RAW);
+  pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
+      .value("BOOL", pd::proto::VarType::BOOL)
+      .value("INT16", pd::proto::VarType::INT16)
+      .value("INT32", pd::proto::VarType::INT32)
+      .value("INT64", pd::proto::VarType::INT64)
+      .value("FP16", pd::proto::VarType::FP16)
+      .value("FP32", pd::proto::VarType::FP32)
+      .value("FP64", pd::proto::VarType::FP64)
+      .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
+      .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
+      .value("FETCH_LIST", pd::proto::VarType::FETCH_LIST)
+      .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
+      .value("CHANNEL", pd::proto::VarType::CHANNEL)
+      .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
+      .value("READER", pd::proto::VarType::READER)
+      .value("RAW", pd::proto::VarType::RAW);
 }
-void BindOpDesc(py::module &m) {
-  py::enum_<proto::AttrType>(m, "AttrType", "")
-      .value("INT", proto::AttrType::INT)
-      .value("INTS", proto::AttrType::INTS)
-      .value("FLOAT", proto::AttrType::FLOAT)
-      .value("FLOATS", proto::AttrType::FLOATS)
-      .value("STRING", proto::AttrType::STRING)
-      .value("STRINGS", proto::AttrType::STRINGS)
-      .value("BOOL", proto::AttrType::BOOLEAN)
-      .value("BOOLS", proto::AttrType::BOOLEANS)
-      .value("BLOCK", proto::AttrType::BLOCK);
-  py::class_<OpDesc> op_desc(m, "OpDesc", "");
+void BindOpDesc(pybind11::module *m) {
+  pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
+      .value("INT", pd::proto::AttrType::INT)
+      .value("INTS", pd::proto::AttrType::INTS)
+      .value("FLOAT", pd::proto::AttrType::FLOAT)
+      .value("FLOATS", pd::proto::AttrType::FLOATS)
+      .value("STRING", pd::proto::AttrType::STRING)
+      .value("STRINGS", pd::proto::AttrType::STRINGS)
+      .value("BOOL", pd::proto::AttrType::BOOLEAN)
+      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
+      .value("BLOCK", pd::proto::AttrType::BLOCK);
+  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
   op_desc
-      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
-           py::return_value_policy::reference)
-      .def("copy_from", &OpDesc::CopyFrom)
-      .def("type", &OpDesc::Type)
-      .def("set_type", &OpDesc::SetType)
-      .def("input", &OpDesc::Input)
-      .def("input_names", &OpDesc::InputNames)
-      .def("output", &OpDesc::Output)
-      .def("output_names", &OpDesc::OutputNames)
-      .def("set_input", &OpDesc::SetInput)
-      .def("set_output", &OpDesc::SetOutput)
-      .def("input_arg_names", &OpDesc::InputArgumentNames)
-      .def("output_arg_names", &OpDesc::OutputArgumentNames)
-      .def("rename_input", &OpDesc::RenameInput)
-      .def("rename_output", &OpDesc::RenameOutput)
-      .def("has_attr", &OpDesc::HasAttr)
-      .def("attr_type", &OpDesc::GetAttrType)
-      .def("attr_names", &OpDesc::AttrNames)
-      .def("set_attr", &OpDesc::SetAttr)
-      .def("attr", &OpDesc::GetAttr)
-      .def("set_block_attr", &OpDesc::SetBlockAttr)
+      .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
+           pybind11::return_value_policy::reference)
+      .def("copy_from", &pd::OpDesc::CopyFrom)
+      .def("type", &pd::OpDesc::Type)
+      .def("set_type", &pd::OpDesc::SetType)
+      .def("input", &pd::OpDesc::Input)
+      .def("input_names", &pd::OpDesc::InputNames)
+      .def("output", &pd::OpDesc::Output)
+      .def("output_names", &pd::OpDesc::OutputNames)
+      .def("set_input", &pd::OpDesc::SetInput)
+      .def("set_output", &pd::OpDesc::SetOutput)
+      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
+      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
+      .def("rename_input", &pd::OpDesc::RenameInput)
+      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("has_attr", &pd::OpDesc::HasAttr)
+      .def("attr_type", &pd::OpDesc::GetAttrType)
+      .def("attr_names", &pd::OpDesc::AttrNames)
+      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("attr", &pd::OpDesc::GetAttr)
+      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
       .def("set_serialized_attr",
-           [](OpDesc &self, const std::string &name,
-              const py::bytes &seriralized) {
+           [](pd::OpDesc &self, const std::string &name,
+              const pybind11::bytes &seriralized) {
            std::string ser(seriralized);
            self.SetAttr(name, ser);
           })
-      .def("block_attr", &OpDesc::GetBlockAttr)
-      .def("check_attrs", &OpDesc::CheckAttrs)
-      .def("infer_shape", &OpDesc::InferShape)
-      .def("infer_var_type", &OpDesc::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDesc>)
-      .def("block", &OpDesc::Block, py::return_value_policy::reference);
+      .def("block_attr", &pd::OpDesc::GetBlockAttr)
+      .def("check_attrs", &pd::OpDesc::CheckAttrs)
+      .def("infer_shape", &pd::OpDesc::InferShape)
+      .def("infer_var_type", &pd::OpDesc::InferVarType)
+      .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
+      .def("block", &pd::OpDesc::Block,
+           pybind11::return_value_policy::reference);
 }
 }  // namespace pybind
...
@@ -11,25 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <Python.h>
 #include <fstream>
 #include <vector>
 #include "paddle/fluid/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
-namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-void BindProgramDesc(py::module& m);
-void BindBlockDesc(py::module& m);
-void BindVarDsec(py::module& m);
-void BindOpDesc(py::module& m);
+void BindProgramDesc(pybind11::module* m);
+void BindBlockDesc(pybind11::module* m);
+void BindVarDsec(pybind11::module* m);
+void BindOpDesc(pybind11::module* m);
 }  // namespace pybind
 }  // namespace paddle
@@ -11,11 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <Python.h>
+#include <algorithm>
+#include <map>
+#include <mutex>  // NOLINT // for call_once
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/pybind/protobuf.h"
-#include <mutex>  // for call_once
-#include <unordered_map>
 #include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
@@ -32,7 +38,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/cond_op.h"
 #include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
@@ -69,7 +74,7 @@ PYBIND11_PLUGIN(core) {
   // not cause namespace pollution.
   using namespace paddle::framework;  // NOLINT
-  BindException(m);
+  BindException(&m);
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
@@ -100,6 +105,14 @@ PYBIND11_PLUGIN(core) {
           [](Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<int>(place);
           })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+             self.mutable_data<float>(place);
+           })
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
@@ -113,6 +126,12 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCUDATensorSetFromArray<int64_t>)
      .def("set", PyCUDATensorSetFromArray<bool>)
      .def("set", PyCUDATensorSetFromArray<uint16_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<float>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<int>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<double>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -317,7 +336,17 @@ All parameter, weight, gradient are variables in Paddle.
 #else
                     return new paddle::platform::CUDADeviceContext(place);
 #endif
-                  });
+                  })
+      .def_static("create",
+                  [](paddle::platform::CUDAPinnedPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifndef PADDLE_WITH_CUDA
+                    PADDLE_THROW(
+                        "CUDAPinnedPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDAPinnedDeviceContext(place);
+#endif
+                  });
   // clang-format on
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
@@ -330,6 +359,10 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("__str__", string::to_string<const platform::CPUPlace &>);
+  py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
       .def("set_place",
@@ -339,7 +372,11 @@ All parameter, weight, gradient are variables in Paddle.
       .def("set_place",
            [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
-           });
+           })
+      .def("set_place", [](platform::Place &self,
+                           const platform::CUDAPinnedPlace &cuda_pinned_place) {
+        self = cuda_pinned_place;
+      });
   py::class_<OperatorBase>(m, "Operator")
       .def_static("create",
@@ -363,6 +400,11 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CUDAPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::CUDAPinnedPlace &place) {
+             self.Run(scope, place);
+           })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
@@ -436,11 +478,11 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
-  BindProgramDesc(m);
-  BindBlockDesc(m);
-  BindVarDsec(m);
-  BindOpDesc(m);
-  BindConstValue(m);
+  BindProgramDesc(&m);
+  BindBlockDesc(&m);
+  BindVarDsec(&m);
+  BindOpDesc(&m);
+  BindConstValue(&m);
   py::class_<framework::LoDRankTable>(m, "LodRankTable")
       .def("items", [](framework::LoDRankTable &table) {
@@ -511,7 +553,7 @@ All parameter, weight, gradient are variables in Paddle.
       })
       .def("run", &ParallelExecutor::Run);
-  BindRecordIOWriter(m);
+  BindRecordIOWriter(&m);
   return m.ptr();
 }
 }  // namespace pybind
...
@@ -13,13 +13,19 @@
 // limitations under the License.
 #include "paddle/fluid/pybind/recordio.h"
 #include <fstream>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/recordio/writer.h"
 namespace paddle {
 namespace pybind {
+namespace {
 class RecordIOWriter {
  public:
   RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
@@ -49,8 +55,10 @@ class RecordIOWriter {
   recordio::Writer writer_;
 };
-void BindRecordIOWriter(py::module& m) {
-  py::class_<RecordIOWriter> writer(m, "RecordIOWriter", "");
+}  // namespace
+
+void BindRecordIOWriter(py::module* m) {
+  py::class_<RecordIOWriter> writer(*m, "RecordIOWriter", "");
   py::enum_<recordio::Compressor>(writer, "Compressor", "")
       .value("Snappy", recordio::Compressor::kSnappy)
       .value("NoCompress", recordio::Compressor::kNoCompress);
...
@@ -21,6 +21,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
-extern void BindRecordIOWriter(py::module& m);
+void BindRecordIOWriter(py::module* m);
 }  // namespace pybind
 }  // namespace paddle
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <Python.h>
 #include <string>
+#include <tuple>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -21,12 +24,8 @@ limitations under the License. */
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
-namespace py = pybind11;
 namespace paddle {
 namespace pybind {
 namespace details {
 template <bool less, size_t I, typename... ARGS>
@@ -34,16 +33,16 @@ struct CastToPyBufferImpl;
 template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<false, I, ARGS...> {
-  py::buffer_info operator()(framework::Tensor &tensor) {
+  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
     PADDLE_THROW("This type of tensor cannot be expose to Python");
-    return py::buffer_info();
+    return pybind11::buffer_info();
   }
 };
 template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
-  py::buffer_info operator()(framework::Tensor &tensor) {
+  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
     if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
@@ -82,15 +81,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
       if (std::type_index(typeid(CUR_TYPE)) ==
           std::type_index(typeid(platform::float16))) {
-        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                               "e", /* np.dtype('e') == np.float16 */
-                               (size_t)framework::arity(dst_tensor.dims()),
-                               dims_outside, strides);
+        return pybind11::buffer_info(
+            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+            "e", /* np.dtype('e') == np.float16 */
+            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
       } else {
-        return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                               py::format_descriptor<CUR_TYPE>::format(),
-                               (size_t)framework::arity(dst_tensor.dims()),
-                               dims_outside, strides);
+        return pybind11::buffer_info(
+            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+            pybind11::format_descriptor<CUR_TYPE>::format(),
+            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
       }
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
@@ -101,7 +100,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 }  // namespace details
-inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
   auto buffer_info =
       details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
                                   platform::float16>()(tensor);
@@ -109,7 +108,7 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
 }
 template <typename T>
-T TensorGetElement(framework::Tensor &self, size_t offset) {
+T TensorGetElement(const framework::Tensor &self, size_t offset) {
   if (platform::is_cpu_place(self.place())) {
     return self.data<T>()[offset];
   } else {
@@ -121,64 +120,70 @@ T TensorGetElement(framework::Tensor &self, size_t offset) {
 // TODO(dzhwinter) : fix the redundent Tensor allocate and free
 template <typename T>
-void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
-  if (platform::is_gpu_place(self.place())) {
+void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
+  if (platform::is_gpu_place(self->place())) {
     std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
+    framework::TensorCopy(*self, platform::CPUPlace(), dst.get());
     dst->data<T>()[offset] = elem;
-    framework::TensorCopy(*dst.get(), self.place(), &self);
-  } else if (platform::is_cpu_place(self.place())) {
-    self.data<T>()[offset] = elem;
+    framework::TensorCopy(*dst.get(), self->place(), self);
+  } else if (platform::is_cpu_place(self->place())) {
+    self->data<T>()[offset] = elem;
   }
 }
 template <typename T>
 void PyCPUTensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CPUPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
 void PyCPUTensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CPUPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<platform::float16>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
   std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
 }
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CUDAPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto dev_ctx =
@@ -188,18 +193,22 @@ void PyCUDATensorSetFromArray(
 }
 template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
 void PyCUDATensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CUDAPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<platform::float16>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto dev_ctx =
@@ -208,6 +217,43 @@ void PyCUDATensorSetFromArray(
                           sizeof(uint16_t) * array.size(),
                           cudaMemcpyHostToDevice, dev_ctx->stream());
 }
+
+template <typename T>
+void PyCUDAPinnedTensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    const paddle::platform::CUDAPinnedPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+
+template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
+void PyCUDAPinnedTensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    const paddle::platform::CUDAPinnedPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
 #endif
 }  // namespace pybind
...
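The helpers above now take Tensor* and receive the place by value (const reference for the pinned variant), and the uint16_t specializations exist because NumPy float16 buffers arrive as raw uint16_t and are reinterpreted as platform::float16. A caller sketch under those assumptions:

#include "paddle/fluid/pybind/tensor_py.h"  // assumed path of the header being diffed

void SetFromNumPySketch(
    pybind11::array_t<float, pybind11::array::c_style |
                                 pybind11::array::forcecast> array) {
  paddle::framework::Tensor t;
  // Tensor is passed by pointer and the place by value, per the new
  // signatures above.
  paddle::pybind::PyCPUTensorSetFromArray<float>(
      &t, array, paddle::platform::CPUPlace());
  paddle::pybind::TensorSetElement<float>(&t, /*offset=*/0, 3.14f);
}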
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/tensor_py.h"
+#include <iostream>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(TensorPy, CastToPyBufferImpl) {
+  typedef int ElemType;
+
+  paddle::framework::Tensor t;
+  auto d = paddle::framework::make_ddim({1, 2, 3});
+  int* p = t.mutable_data<ElemType>(d, paddle::platform::CPUPlace());
+  for (int i = 0; i < paddle::framework::product(d); ++i) {
+    p[i] = i;
+  }
+
+  pybind11::buffer_info bi = paddle::pybind::CastToPyBuffer(t);
+  EXPECT_EQ(bi.itemsize, static_cast<size_t>(sizeof(ElemType)));
+  EXPECT_EQ(bi.size, static_cast<size_t>(paddle::framework::product(d)));
+  EXPECT_EQ(bi.ndim, static_cast<size_t>(3));  // 3-dimensional as d.
+  EXPECT_EQ(bi.shape.size(), 3U);              // as Dim d.
+  EXPECT_EQ(bi.shape[0], static_cast<size_t>(1));
+  EXPECT_EQ(bi.shape[1], static_cast<size_t>(2));
+  EXPECT_EQ(bi.shape[2], static_cast<size_t>(3));
+  EXPECT_EQ(bi.strides.size(), 3U);  // 3-dimensional as d.
+  EXPECT_EQ(bi.strides[2], static_cast<size_t>(sizeof(ElemType)));
+  EXPECT_EQ(bi.strides[1], static_cast<size_t>(sizeof(ElemType) * 3));
+  EXPECT_EQ(bi.strides[0], static_cast<size_t>(sizeof(ElemType) * 2 * 3));
+}
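The stride expectations in this new test follow from row-major layout: for shape {1, 2, 3} with 4-byte int elements the byte strides are {2*3*4, 3*4, 4} = {24, 12, 4}, which is what the three EXPECT_EQ checks on bi.strides encode:

#include <cstddef>

// Row-major byte strides for a {1, 2, 3} tensor of 4-byte ints.
constexpr std::size_t kElem = sizeof(int);       // 4
constexpr std::size_t kStride2 = kElem;          // 4  -> bi.strides[2]
constexpr std::size_t kStride1 = 3 * kElem;      // 12 -> bi.strides[1]
constexpr std::size_t kStride0 = 2 * 3 * kElem;  // 24 -> bi.strides[0]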
...@@ -14,11 +14,13 @@ ...@@ -14,11 +14,13 @@
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
#include <algorithm>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "snappystream.hpp" #include "snappy_stream/include/snappystream.hpp"
#include "zlib.h" #include "zlib/include/zlib.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
...@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) { ...@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
* Copy stream in to another stream * Copy stream in to another stream
*/ */
static void PipeStream(std::istream& in, std::ostream& os) { static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf( ReadStreamByBuf(in, 0,
in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); }); [&os](const char* buf, size_t len) { os.write(buf, len); });
} }
/** /**
...@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) { ...@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) {
static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) { static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0)); uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) { ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = static_cast<uint32_t>(crc32( crc = static_cast<uint32_t>(crc32(crc, reinterpret_cast<const Bytef*>(buf),
crc, reinterpret_cast<const Bytef*>(buf), static_cast<uInt>(len))); static_cast<uInt>(len)));
}); });
return crc; return crc;
} }
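Crc32Stream folds the buffer-by-buffer reads into a running CRC; zlib's crc32 is designed for exactly this incremental use. A Python sketch of the same pattern (assumes only the standard-library zlib module):

import zlib

def crc32_stream(chunks):
    crc = zlib.crc32(b"")  # same seed as crc32(0, nullptr, 0)
    for buf in chunks:
        crc = zlib.crc32(buf, crc)  # fold in the next buffer
    return crc & 0xFFFFFFFF

assert crc32_stream([b"hello ", b"world"]) == zlib.crc32(b"hello world") & 0xFFFFFFFF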
......
...@@ -24,7 +24,7 @@ namespace recordio { ...@@ -24,7 +24,7 @@ namespace recordio {
// A Chunk contains the Header and optionally compressed records. // A Chunk contains the Header and optionally compressed records.
class Chunk { class Chunk {
public: public:
Chunk() : num_bytes_(0) {} Chunk() : num_bytes_(0) {}
void Add(const std::string& buf) { void Add(const std::string& buf) {
num_bytes_ += buf.size(); num_bytes_ += buf.size();
...@@ -46,7 +46,7 @@ public: ...@@ -46,7 +46,7 @@ public:
bool Empty() const { return records_.empty(); } bool Empty() const { return records_.empty(); }
private: private:
std::vector<std::string> records_; std::vector<std::string> records_;
// sum of record lengths in bytes. // sum of record lengths in bytes.
size_t num_bytes_; size_t num_bytes_;
......
...@@ -18,29 +18,27 @@ ...@@ -18,29 +18,27 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Chunk, SaveLoad) { TEST(Chunk, SaveLoad) {
Chunk ch; paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6)); ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
std::stringstream ss; std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress); ch.Write(ss, paddle::recordio::Compressor::kNoCompress);
ss.seekg(0); ss.seekg(0);
ch.Parse(ss); ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U); ASSERT_EQ(ch.NumBytes(), 10U);
} }
TEST(Chunk, Compressor) { TEST(Chunk, Compressor) {
Chunk ch; paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6)); ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4)); ch.Add(std::string("123", 4));
std::stringstream ss; std::stringstream ss;
ch.Write(ss, Compressor::kSnappy); ch.Write(ss, paddle::recordio::Compressor::kSnappy);
std::stringstream ss2; std::stringstream ss2;
ch.Write(ss2, Compressor::kNoCompress); ch.Write(ss2, paddle::recordio::Compressor::kNoCompress);
  ASSERT_LE(ss.tellp(), ss2.tellp());  // Compressed output should contain less data.   ASSERT_LE(ss.tellp(), ss2.tellp());  // Compressed output should contain less data.
ch.Clear(); ch.Clear();
......
...@@ -37,7 +37,7 @@ enum class Compressor : uint32_t { ...@@ -37,7 +37,7 @@ enum class Compressor : uint32_t {
// Header is the metadata of Chunk // Header is the metadata of Chunk
class Header { class Header {
public: public:
Header(); Header();
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs); Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
...@@ -51,7 +51,7 @@ public: ...@@ -51,7 +51,7 @@ public:
Compressor CompressType() const { return compressor_; } Compressor CompressType() const { return compressor_; }
uint32_t CompressSize() const { return compress_size_; } uint32_t CompressSize() const { return compress_size_; }
private: private:
uint32_t num_records_; uint32_t num_records_;
uint32_t checksum_; uint32_t checksum_;
Compressor compressor_; Compressor compressor_;
......
...@@ -18,14 +18,12 @@ ...@@ -18,14 +18,12 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Recordio, ChunkHead) { TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3); paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3);
std::stringstream ss; std::stringstream ss;
hdr.Write(ss); hdr.Write(ss);
ss.seekg(0, std::ios::beg); ss.seekg(0, std::ios::beg);
Header hdr2; paddle::recordio::Header hdr2;
hdr2.Parse(ss); hdr2.Parse(ss);
EXPECT_TRUE(hdr == hdr2); EXPECT_TRUE(hdr == hdr2);
} }
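The round-trip above relies on Header serializing its four uint32 fields and parsing them back verbatim. A hedged standard-library analogue (the little-endian four-field layout and the enum's numeric codes are assumptions for illustration, not the verified on-disk format):

import struct

def write_header(num_records, checksum, compressor, compress_size):
    # Assumed layout: four little-endian uint32 fields.
    return struct.pack('<IIII', num_records, checksum, compressor, compress_size)

def parse_header(buf):
    return struct.unpack('<IIII', buf)

fields = (0, 1, 2, 3)  # illustrative values only
assert parse_header(write_header(*fields)) == fields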
...@@ -13,10 +13,14 @@ ...@@ -13,10 +13,14 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include <string>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
Scanner::Scanner(std::unique_ptr<std::istream> &&stream) Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
: stream_(std::move(stream)) { : stream_(std::move(stream)) {
Reset(); Reset();
......
...@@ -16,12 +16,15 @@ ...@@ -16,12 +16,15 @@
#include <fstream> #include <fstream>
#include <memory> #include <memory>
#include <string>
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
class Scanner { class Scanner {
public: public:
explicit Scanner(std::unique_ptr<std::istream>&& stream); explicit Scanner(std::unique_ptr<std::istream>&& stream);
explicit Scanner(const std::string& filename); explicit Scanner(const std::string& filename);
...@@ -32,7 +35,7 @@ public: ...@@ -32,7 +35,7 @@ public:
bool HasNext() const; bool HasNext() const;
private: private:
std::unique_ptr<std::istream> stream_; std::unique_ptr<std::istream> stream_;
Chunk cur_chunk_; Chunk cur_chunk_;
size_t offset_; size_t offset_;
......
...@@ -12,9 +12,14 @@ ...@@ -12,9 +12,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
#include <string>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
void Writer::Write(const std::string& record) { void Writer::Write(const std::string& record) {
cur_chunk_.Add(record); cur_chunk_.Add(record);
if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) { if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
......
...@@ -11,16 +11,17 @@ ...@@ -11,16 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <string>
#include "paddle/fluid/recordio/chunk.h" #include "paddle/fluid/recordio/chunk.h"
namespace paddle { namespace paddle {
namespace recordio { namespace recordio {
class Writer { class Writer {
public: public:
Writer(std::ostream* sout, Writer(std::ostream* sout, Compressor compressor,
Compressor compressor,
size_t max_num_records_in_chunk = 1000) size_t max_num_records_in_chunk = 1000)
: stream_(*sout), : stream_(*sout),
max_num_records_in_chunk_(max_num_records_in_chunk), max_num_records_in_chunk_(max_num_records_in_chunk),
...@@ -32,7 +33,7 @@ public: ...@@ -32,7 +33,7 @@ public:
~Writer(); ~Writer();
private: private:
std::ostream& stream_; std::ostream& stream_;
size_t max_num_records_in_chunk_; size_t max_num_records_in_chunk_;
Chunk cur_chunk_; Chunk cur_chunk_;
......
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "gtest/gtest.h"
#include <sstream> #include <sstream>
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/recordio/scanner.h" #include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h" #include "paddle/fluid/recordio/writer.h"
...@@ -66,4 +67,4 @@ TEST(WriterScanner, TinyChunk) { ...@@ -66,4 +67,4 @@ TEST(WriterScanner, TinyChunk) {
ASSERT_EQ(scanner.Next(), "DEFG"); ASSERT_EQ(scanner.Next(), "DEFG");
ASSERT_FALSE(scanner.HasNext()); ASSERT_FALSE(scanner.HasNext());
} }
} }
\ No newline at end of file
../framework/.clang-format
\ No newline at end of file
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "piece.h" #include "paddle/fluid/string/piece.h"
#include <string.h> #include <string.h>
......
...@@ -71,6 +71,8 @@ ...@@ -71,6 +71,8 @@
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <string>
#include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat #include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat
namespace paddle { namespace paddle {
......
...@@ -11,7 +11,8 @@ ...@@ -11,7 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "printf.h"
#include "paddle/fluid/string/printf.h"
#include <string> #include <string>
...@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) { ...@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) {
std::string weekday = "Wednesday"; std::string weekday = "Wednesday";
const char* month = "July"; const char* month = "July";
size_t day = 27; size_t day = 27;
long hour = 14; int hour = 14;
int min = 44; int min = 44;
EXPECT_EQ(std::string("Wednesday, July 27, 14:44"), EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day, paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "to_string.h" #include "paddle/fluid/string/to_string.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
constexpr char kOutputString[] = "User Defined Output"; constexpr char kOutputString[] = "User Defined Output";
...@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) { ...@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
} }
TEST(to_string, normal) { TEST(to_string, normal) {
using namespace paddle::string; using paddle::string::to_string;
ASSERT_EQ("10", to_string(10)); ASSERT_EQ("10", to_string(10));
ASSERT_EQ("abc", to_string("abc")); ASSERT_EQ("abc", to_string("abc"));
ASSERT_EQ("1.2", to_string(1.2)); ASSERT_EQ("1.2", to_string(1.2));
} }
TEST(to_string, user_defined) { TEST(to_string, user_defined) {
using namespace paddle::string;
UserDefinedClass instance; UserDefinedClass instance;
ASSERT_EQ(kOutputString, to_string(instance)); ASSERT_EQ(kOutputString, paddle::string::to_string(instance));
} }
...@@ -14,6 +14,11 @@ function(gserver_test TARGET) ...@@ -14,6 +14,11 @@ function(gserver_test TARGET)
COMMAND ${TARGET}) COMMAND ${TARGET})
endfunction() endfunction()
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
gserver_test(test_LayerGrad) gserver_test(test_LayerGrad)
gserver_test(test_CRFLayerGrad) gserver_test(test_CRFLayerGrad)
gserver_test(test_CrossEntropyOverBeamGrad) gserver_test(test_CrossEntropyOverBeamGrad)
...@@ -31,12 +36,12 @@ gserver_test(test_Upsample) ...@@ -31,12 +36,12 @@ gserver_test(test_Upsample)
set(PYTHON_PATH set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests) ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/gserver/tests)
function(gserver_test_with_python TARGET) function(gserver_test_with_python TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp) add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET} add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction() endfunction()
gserver_test_with_python(test_PyDataProvider2) gserver_test_with_python(test_PyDataProvider2)
...@@ -57,7 +62,7 @@ if(WITH_MKLDNN) ...@@ -57,7 +62,7 @@ if(WITH_MKLDNN)
LayerGradUtil.cpp) LayerGradUtil.cpp)
add_test(NAME test_MKLDNN add_test(NAME test_MKLDNN
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif() endif()
############### test_WarpCTCLayer ####################### ############### test_WarpCTCLayer #######################
...@@ -66,7 +71,7 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) ...@@ -66,7 +71,7 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
test_WarpCTCLayer.cpp) test_WarpCTCLayer.cpp)
add_test(NAME test_WarpCTCLayer add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
...@@ -84,15 +89,15 @@ if(NOT MOBILE_INFERENCE) ...@@ -84,15 +89,15 @@ if(NOT MOBILE_INFERENCE)
endif() endif()
add_test(NAME test_NetworkCompare add_test(NAME test_NetworkCompare
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
############ test_CompareSparse ################ ############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp) test_CompareSparse.cpp)
if(NOT ON_TRAVIS) if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse add_test(NAME test_CompareSparse
COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6 COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif() endif()
endif() endif()
...@@ -20,10 +20,8 @@ limitations under the License. */ ...@@ -20,10 +20,8 @@ limitations under the License. */
#include "paddle/math/MathUtils.h" #include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h" #include "paddle/testing/TestUtil.h"
using namespace paddle; void setPoolConfig(paddle::TestConfig* config,
paddle::PoolConfig* pool,
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
const string& poolType) { const string& poolType) {
(*config).biasSize = 0; (*config).biasSize = 0;
(*config).layerConfig.set_type("pool"); (*config).layerConfig.set_type("pool");
...@@ -42,21 +40,23 @@ void setPoolConfig(TestConfig* config, ...@@ -42,21 +40,23 @@ void setPoolConfig(TestConfig* config,
pool->set_stride(sw); pool->set_stride(sw);
pool->set_stride_y(sh); pool->set_stride_y(sh);
int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); int ow =
int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh =
paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow); pool->set_output_x(ow);
pool->set_output_y(oh); pool->set_output_y(oh);
} }
LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
const string& poolType, const string& poolType,
bool use_gpu, bool use_gpu,
real* tempGradData) { real* tempGradData) {
/* prepare maxPoolWithMaskLayer */ /* prepare maxPoolWithMaskLayer */
TestConfig config; paddle::TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 128, 0}); config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
LayerInputConfig* input = config.layerConfig.add_inputs(); paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf(); paddle::PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(8); pool->set_img_size(8);
pool->set_img_size_y(8); pool->set_img_size_y(8);
...@@ -66,9 +66,9 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -66,9 +66,9 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
config.layerConfig.set_name("MaxPoolWithMask"); config.layerConfig.set_name("MaxPoolWithMask");
std::vector<DataLayerPtr> dataLayers; std::vector<paddle::DataLayerPtr> dataLayers;
LayerMap layerMap; paddle::LayerMap layerMap;
vector<Argument> datas; vector<paddle::Argument> datas;
initDataLayer(config, initDataLayer(config,
&dataLayers, &dataLayers,
...@@ -82,20 +82,20 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -82,20 +82,20 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
dataLayers[0]->getOutputValue()->copyFrom(*inputMat); dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu; FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters; std::vector<paddle::ParameterPtr> parameters;
LayerPtr maxPoolingWithMaskOutputLayer; paddle::LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer); initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(PASS_GC); maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
/* prepare the upsample layer */ /* prepare the upsample layer */
LayerConfig upsampleLayerConfig; paddle::LayerConfig upsampleLayerConfig;
upsampleLayerConfig.set_type("upsample"); upsampleLayerConfig.set_type("upsample");
LayerInputConfig* input1 = upsampleLayerConfig.add_inputs(); paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
upsampleLayerConfig.add_inputs(); upsampleLayerConfig.add_inputs();
UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf(); paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
upsampleConfig->set_scale(2); upsampleConfig->set_scale(2);
ImageConfig* imageConfig = upsampleConfig->mutable_image_conf(); paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
imageConfig->set_channels(2); imageConfig->set_channels(2);
imageConfig->set_img_size(4); imageConfig->set_img_size(4);
imageConfig->set_img_size_y(4); imageConfig->set_img_size_y(4);
...@@ -103,17 +103,18 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -103,17 +103,18 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
upsampleLayerConfig.set_name("upsample"); upsampleLayerConfig.set_name("upsample");
for (size_t i = 0; i < 2; i++) { for (size_t i = 0; i < 2; i++) {
LayerInputConfig& inputTemp = *(upsampleLayerConfig.mutable_inputs(i)); paddle::LayerInputConfig& inputTemp =
*(upsampleLayerConfig.mutable_inputs(i));
inputTemp.set_input_layer_name("MaxPoolWithMask"); inputTemp.set_input_layer_name("MaxPoolWithMask");
} }
LayerPtr upsampleLayer; paddle::LayerPtr upsampleLayer;
ParameterMap parameterMap; paddle::ParameterMap parameterMap;
upsampleLayer = Layer::create(upsampleLayerConfig); upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
layerMap[upsampleLayerConfig.name()] = upsampleLayer; layerMap[upsampleLayerConfig.name()] = upsampleLayer;
upsampleLayer->init(layerMap, parameterMap); upsampleLayer->init(layerMap, parameterMap);
upsampleLayer->setNeedGradient(true); upsampleLayer->setNeedGradient(true);
upsampleLayer->forward(PASS_GC); upsampleLayer->forward(paddle::PASS_GC);
upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128); upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
upsampleLayer->backward(); upsampleLayer->backward();
...@@ -122,31 +123,31 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat, ...@@ -122,31 +123,31 @@ LayerPtr doOneUpsampleTest(MatrixPtr& inputMat,
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) { TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false; bool useGpu = false;
MatrixPtr inputMat; paddle::MatrixPtr inputMat;
MatrixPtr inputGPUMat; paddle::MatrixPtr inputGPUMat;
MatrixPtr tempGradMat; paddle::MatrixPtr tempGradMat;
inputMat = Matrix::create(1, 128, false, useGpu); inputMat = paddle::Matrix::create(1, 128, false, useGpu);
inputMat->randomizeUniform(); inputMat->randomizeUniform();
tempGradMat = Matrix::create(1, 128, false, useGpu); tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
tempGradMat->randomizeUniform(); tempGradMat->randomizeUniform();
real* data = inputMat->getData();
real* tempGradData = tempGradMat->getData(); real* tempGradData = tempGradMat->getData();
LayerPtr upsampleLayerCPU = paddle::LayerPtr upsampleLayerCPU =
doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData); doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
useGpu = true; useGpu = true;
inputGPUMat = Matrix::create(1, 128, false, useGpu); real* data = inputMat->getData();
inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
inputGPUMat->copyFrom(data, 128); inputGPUMat->copyFrom(data, 128);
LayerPtr upsampleLayerGPU = doOneUpsampleTest( paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
inputGPUMat, "max-pool-with-mask", useGpu, tempGradData); inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
checkMatrixEqual(upsampleLayerCPU->getOutput("").value, paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
upsampleLayerGPU->getOutput("").value); upsampleLayerGPU->getOutput("").value);
checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(), paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
upsampleLayerGPU->getPrev(0)->getOutputGrad()); upsampleLayerGPU->getPrev(0)->getOutputGrad());
#endif #endif
} }
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
set(PYTHON_PATH set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests) ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/trainer/tests)
function(trainer_test TARGET) function(trainer_test TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp) add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET} add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction() endfunction()
trainer_test(test_Compare) trainer_test(test_Compare)
...@@ -22,11 +27,11 @@ if(WITH_PYTHON) ...@@ -22,11 +27,11 @@ if(WITH_PYTHON)
add_test(NAME test_TrainerOnePass add_test(NAME test_TrainerOnePass
COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif() endif()
#################### test_config_parser ######################### #################### test_config_parser #########################
add_test(NAME test_config_parser add_test(NAME test_config_parser
COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE}
${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
file(GLOB UTIL_HEADERS . *.h) file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp) file(GLOB UTIL_SOURCES . *.cpp)
create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c) ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
set(UTIL_RES ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c) set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
if(APPLE) if(APPLE)
file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp) file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
......
...@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames}) ...@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames})
get_filename_component(ABS_FIL ${filename} ABSOLUTE) get_filename_component(ABS_FIL ${filename} ABSOLUTE)
get_filename_component(FIL_WE ${filename} NAME_WE) get_filename_component(FIL_WE ${filename} NAME_WE)
set(CUR_PROTO_GEN_PY set(CUR_PROTO_GEN_PY
${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py) ${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
set(PROTO_GEN_PY set(PROTO_GEN_PY
${CUR_PROTO_GEN_PY} ${CUR_PROTO_GEN_PY}
${PROTO_GEN_PY}) ${PROTO_GEN_PY})
add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY} add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto" ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
"-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc) DEPENDS ${ABS_FIL} protoc)
endforeach() endforeach()
......
...@@ -47,14 +47,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ...@@ -47,14 +47,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
DEPENDS paddle_pybind) DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so) add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc COMMAND touch stub.cc
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
......
...@@ -31,7 +31,7 @@ import regularizer ...@@ -31,7 +31,7 @@ import regularizer
import average import average
from param_attr import ParamAttr, WeightNormParamAttr from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from distribute_transpiler import DistributeTranspiler from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv, from concurrency import (Go, make_channel, channel_send, channel_recv,
...@@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ ...@@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'LoDTensor', 'LoDTensor',
'CPUPlace', 'CPUPlace',
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -17,7 +17,7 @@ import framework ...@@ -17,7 +17,7 @@ import framework
from framework import Program, default_main_program, default_startup_program, Parameter, Variable from framework import Program, default_main_program, default_startup_program, Parameter, Variable
import optimizer import optimizer
from layer_helper import LayerHelper from layer_helper import LayerHelper
from distributed_spliter import * import distributed_splitter as splitter
import math import math
from . import core from . import core
import debuger import debuger
...@@ -36,7 +36,7 @@ class VarBlock: ...@@ -36,7 +36,7 @@ class VarBlock:
class UnionFind(object): class UnionFind(object):
""" Union-find data struct. """ Union-find data struct.
Union-find is a data struct that keeps track of a set of elements partitioned Union-find is a data struct that keeps track of a set of elements partitioned
into a number of disjoint (non-overlapping) subsets. into a number of disjoint (non-overlapping) subsets.
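As a reference for the docstring above, a minimal path-compressing union-find sketch (illustrative only, not the transpiler's actual class):

class MiniUnionFind(object):
    def __init__(self, elems):
        self._parent = dict((e, e) for e in elems)

    def find(self, x):
        while self._parent[x] != x:
            self._parent[x] = self._parent[self._parent[x]]  # path halving
            x = self._parent[x]
        return x

    def union(self, a, b):
        self._parent[self.find(a)] = self.find(b)

    def is_connected(self, a, b):
        return self.find(a) == self.find(b)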
...@@ -138,7 +138,7 @@ class DistributeTranspiler: ...@@ -138,7 +138,7 @@ class DistributeTranspiler:
program=None, program=None,
pservers="127.0.0.1:6174", pservers="127.0.0.1:6174",
trainers=1, trainers=1,
split_method=round_robin): split_method=splitter.round_robin):
""" """
Transpile the program to distributed data-parallelism programs. Transpile the program to distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server The main_program will be transformed to use a remote parameter server
...@@ -303,7 +303,7 @@ class DistributeTranspiler: ...@@ -303,7 +303,7 @@ class DistributeTranspiler:
# If two ops are connected, we could add these two ops # If two ops are connected, we could add these two ops
# into one set. # into one set.
ufind = self._create_ufind(self.optimize_ops) ufind = self._create_ufind(self.optimize_ops)
# step 4.2 # step 4.2
# Iterate through the ops and append optimize op which # Iterate through the ops and append optimize op which
# located on current pserver # located on current pserver
opt_op_on_pserver = [] opt_op_on_pserver = []
...@@ -312,7 +312,7 @@ class DistributeTranspiler: ...@@ -312,7 +312,7 @@ class DistributeTranspiler:
opt_op_on_pserver.append(op) opt_op_on_pserver.append(op)
# step 4.3 # step 4.3
# Iterate through the ops, and if an op and the optimize ops # Iterate through the ops, and if an op and the optimize ops
# which located on current pserver are in one set, then # which located on current pserver are in one set, then
# append it into the sub program. # append it into the sub program.
# We try to put optimization program run parallelly, assume # We try to put optimization program run parallelly, assume
...@@ -408,11 +408,7 @@ class DistributeTranspiler: ...@@ -408,11 +408,7 @@ class DistributeTranspiler:
pserver_vars = pserver_program.global_block().vars pserver_vars = pserver_program.global_block().vars
created_var_map = dict() created_var_map = dict()
for _, var in pserver_vars.iteritems(): for _, var in pserver_vars.iteritems():
tmpvar = s_prog.global_block().create_var( tmpvar = s_prog.global_block().clone_variable(var)
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
created_var_map[var.name] = tmpvar created_var_map[var.name] = tmpvar
# 2. rename op outputs # 2. rename op outputs
...@@ -708,11 +704,7 @@ class DistributeTranspiler: ...@@ -708,11 +704,7 @@ class DistributeTranspiler:
varlist = [varlist] varlist = [varlist]
for var in varlist: for var in varlist:
program.global_block().create_var( program.global_block().clone_variable(var)
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
...@@ -760,7 +752,7 @@ class DistributeTranspiler: ...@@ -760,7 +752,7 @@ class DistributeTranspiler:
def _is_opt_op(self, op): def _is_opt_op(self, op):
        # NOTE: this is a HACK implementation.         # NOTE: this is a HACK implementation.
        # optimize ops: SGDOptimizer, MomentumOptimizer, AdamOptimizer, etc.         # optimize ops: SGDOptimizer, MomentumOptimizer, AdamOptimizer, etc.
if "Param" in op.input_names and \ if "Param" in op.input_names and \
"LearningRate" in op.input_names: "LearningRate" in op.input_names:
return True return True
......
...@@ -17,8 +17,10 @@ def hash_name(varlist, pserver_endpoints): ...@@ -17,8 +17,10 @@ def hash_name(varlist, pserver_endpoints):
""" """
hash variable names to several endpoints. hash variable names to several endpoints.
:param varlist: a list of Variables Args:
:return: a map of pserver endpoint -> varname varlist(list): a list of Variables
Returns(dict): a map of pserver endpoint -> varname
""" """
def _hash_block(block_str, total): def _hash_block(block_str, total):
...@@ -34,9 +36,14 @@ def hash_name(varlist, pserver_endpoints): ...@@ -34,9 +36,14 @@ def hash_name(varlist, pserver_endpoints):
def round_robin(varlist, pserver_endpoints): def round_robin(varlist, pserver_endpoints):
""" """
distribute variables to several endpoints. Distribute variables to several endpoints.
Args:
varlist(list): a list of variables
pserver_endpoints(list): a list of pserver endpoints
    Returns(list[str]): the endpoint assigned to each variable
""" """
assert (len(varlist) > len(pserver_endpoints)) assert (len(varlist) >= len(pserver_endpoints))
eplist = [] eplist = []
pserver_idx = 0 pserver_idx = 0
......
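Given the docstring above, the round-robin policy reduces to assigning variable i to endpoint i mod len(pserver_endpoints). A self-contained sketch of the elided body (illustrative, not the module's exact code):

def round_robin_sketch(varlist, pserver_endpoints):
    assert len(varlist) >= len(pserver_endpoints)
    eplist = []
    pserver_idx = 0
    for _ in varlist:
        eplist.append(pserver_endpoints[pserver_idx])
        pserver_idx = (pserver_idx + 1) % len(pserver_endpoints)
    return eplist

assert round_robin_sketch(['w', 'b', 'm'], ['ps0:6174', 'ps1:6174']) == \
    ['ps0:6174', 'ps1:6174', 'ps0:6174']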
...@@ -640,6 +640,20 @@ class Operator(object): ...@@ -640,6 +640,20 @@ class Operator(object):
""" """
return self.desc.block_attr(name) return self.desc.block_attr(name)
def all_attrs(self):
"""
Get the attribute dict
Returns(dict): The Operator's attribute dict
"""
attr_names = self.attr_names
attr_map = {}
for n in attr_names:
if n == 'sub_block':
attr_map[n] = self.block_attr(n)
else:
attr_map[n] = self.attr(n)
return attr_map
class Block(object): class Block(object):
def __init__(self, program, idx): def __init__(self, program, idx):
...@@ -838,7 +852,7 @@ class Block(object): ...@@ -838,7 +852,7 @@ class Block(object):
def sync_with_cpp(self): def sync_with_cpp(self):
""" """
Sync with the desc on the c++ end. Sync from the desc on the c++ end.
This method is used to synchronize the c++ desc instance generated by backward. This method is used to synchronize the c++ desc instance generated by backward.
""" """
...@@ -946,13 +960,20 @@ class Block(object): ...@@ -946,13 +960,20 @@ class Block(object):
The new variable cloned from 'var' in current block. The new variable cloned from 'var' in current block.
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
return self.create_var( ret_var = None
name=var.name, # make STEP_SCOPES var can be safely cloned.
shape=var.shape, if var.type == core.VarDesc.VarType.STEP_SCOPES:
dtype=var.dtype, ret_var = self.create_var(
type=var.type, name=var.name, persistable=var.persistable, type=var.type)
lod_level=var.lod_level, else:
persistable=True) ret_var = self.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True)
return ret_var
class Program(object): class Program(object):
......
...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var): ...@@ -255,7 +255,32 @@ def _copy_reader_var_(block, var):
new_var.desc.set_shapes(var.desc.shapes()) new_var.desc.set_shapes(var.desc.shapes())
new_var.desc.set_dtypes(var.desc.dtypes()) new_var.desc.set_dtypes(var.desc.dtypes())
new_var.persistable = True new_var.persistable = True
return monkey_patch_reader_methods(new_var) return new_var
def _copy_reader_create_op_(block, op):
input_param_names = op.input_names
new_input_map = {}
for param_name in input_param_names:
new_input_map[param_name] = []
arg_names = op.input(param_name)
for arg_name in arg_names:
new_input_map[param_name].append(block.var(arg_name))
output_param_names = op.output_names
new_output_map = {}
for param_name in output_param_names:
new_output_map[param_name] = []
arg_names = op.output(param_name)
for arg_name in arg_names:
new_output_map[param_name].append(block.var(arg_name))
new_op = block.append_op(
type=op.type,
inputs=new_input_map,
outputs=new_output_map,
attrs=op.all_attrs())
return new_op
def open_recordio_file(filename, shapes, lod_levels, dtypes): def open_recordio_file(filename, shapes, lod_levels, dtypes):
...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes): ...@@ -283,8 +308,9 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def open_files(filenames, def open_files(filenames,
...@@ -353,22 +379,25 @@ def open_files(filenames, ...@@ -353,22 +379,25 @@ def open_files(filenames,
startup_var.desc.set_dtypes(dtypes) startup_var.desc.set_dtypes(dtypes)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var) startup_var)
return monkey_patch_reader_methods(main_prog_var)
def __create_decorated_reader__(op_type, reader, attrs): def __create_decorated_reader__(op_type, reader, attrs):
var_name = unique_name(op_type) var_name = unique_name(op_type)
startup_blk = default_startup_program().current_block() startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=var_name) startup_var = startup_blk.create_var(name=var_name)
    startup_blk.append_op(     startup_op = startup_blk.append_op(
type=op_type, type=op_type,
inputs={'UnderlyingReader': reader}, inputs={'UnderlyingReader': reader},
outputs={'Out': [startup_var]}, outputs={'Out': [startup_var]},
attrs=attrs) attrs=attrs)
startup_var.persistable = True startup_var.persistable = True
return _copy_reader_var_(default_main_program().current_block(), main_prog_block = default_main_program().current_block()
startup_var) main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
    _copy_reader_create_op_(main_prog_block, startup_op)
return monkey_patch_reader_methods(main_prog_var)
def create_shuffle_reader(reader, buffer_size): def create_shuffle_reader(reader, buffer_size):
......
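Usage sketch of the reader entry points above, mirroring the unit tests later in this diff (assumes ./mnist.recordio already exists on disk):

import paddle.fluid as fluid

reader = fluid.layers.open_recordio_file(
    filename='./mnist.recordio',
    shapes=[[-1, 784], [-1, 1]],
    lod_levels=[0, 0],
    dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)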
...@@ -26,25 +26,29 @@ class ParallelExecutor(object): ...@@ -26,25 +26,29 @@ class ParallelExecutor(object):
use_cuda, use_cuda,
num_threads=None, num_threads=None,
allow_op_delay=False): allow_op_delay=False):
places = [] self._places = []
self._act_places = []
if use_cuda: if use_cuda:
for i in xrange(core.get_cuda_device_count()): for i in xrange(core.get_cuda_device_count()):
p = core.Place() p = core.Place()
p.set_place(core.CUDAPlace(i)) self._act_places.append(core.CUDAPlace(i))
places.append(p) p.set_place(self._act_places[-1])
self._places.append(p)
else: else:
for i in xrange(multiprocessing.cpu_count()): for i in xrange(multiprocessing.cpu_count()):
p = core.Place() p = core.Place()
                p.set_place(core.CPUPlace())                 self._act_places.append(core.CPUPlace())
places.append(p) p.set_place(self._act_places[-1])
self._places.append(p)
assert self._places, "no place for execution"
if num_threads is None: if num_threads is None:
if use_cuda: if use_cuda:
# Experiments on se-resnext shows that too many threads hurt # Experiments on se-resnext shows that too many threads hurt
                # performance. Worth tuning for other models in the future.                 # performance. Worth tuning for other models in the future.
num_threads = len(places) num_threads = len(self._places)
else: else:
                num_threads = min(len(places) * 2, multiprocessing.cpu_count())                 num_threads = min(len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program() startup = framework.default_startup_program()
main = framework.default_main_program() main = framework.default_main_program()
...@@ -53,7 +57,7 @@ class ParallelExecutor(object): ...@@ -53,7 +57,7 @@ class ParallelExecutor(object):
self.executor = core.ParallelExecutor( self.executor = core.ParallelExecutor(
num_threads, num_threads,
True if use_cuda else False, # use_event True if use_cuda else False, # use_event
places, self._places,
set([ set([
p.name for p in main.global_block().iter_parameters() p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
...@@ -65,8 +69,25 @@ class ParallelExecutor(object): ...@@ -65,8 +69,25 @@ class ParallelExecutor(object):
allow_op_delay) allow_op_delay)
self.scope = scope self.scope = scope
def run(self, fetch_list): def run(self, fetch_list, feed_dict={}):
"""
:param fetch_list: A list of variable names that will be fetched.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array.
:return: fetched value list.
"""
if not isinstance(feed_dict, dict):
raise TypeError("feed_dict should be a dict")
feed_tensor_dict = {}
for i, feed_name in enumerate(feed_dict):
feed_tensor = feed_dict[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
feed_tensor.set(feed_dict[feed_name], self._act_places[0])
feed_tensor_dict[feed_name] = feed_tensor
fetch_var_name = '@FETCHED_VAR_NAME@' fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name) self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))] return [arr[i] for i in range(len(arr))]
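The feed_dict path above wraps plain numpy arrays into LoDTensor on the first place; the same conversion can be done by hand. A minimal sketch using only calls that appear elsewhere in this diff:

import numpy
import paddle.fluid.core as core

place = core.CPUPlace()
arr = numpy.ones([2, 3]).astype('float32')
tensor = core.LoDTensor()
tensor.set(arr, place)  # what run() does for each numpy entry in feed_dict
assert numpy.array(tensor).shape == (2, 3)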
...@@ -22,9 +22,9 @@ function(py_test_modules TARGET_NAME) ...@@ -22,9 +22,9 @@ function(py_test_modules TARGET_NAME)
set(multiValueArgs MODULES DEPS ARGS ENVS) set(multiValueArgs MODULES DEPS ARGS ENVS)
cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_modules_ENVS} COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS} ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif() endif()
endfunction() endfunction()
......
...@@ -97,8 +97,11 @@ class TestConv2dOp(OpTest): ...@@ -97,8 +97,11 @@ class TestConv2dOp(OpTest):
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
def testcudnn(self):
return core.is_compiled_with_cuda() and self.use_cudnn
def test_check_output(self): def test_check_output(self):
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5) self.check_output_with_place(place, atol=1e-5)
else: else:
...@@ -107,7 +110,7 @@ class TestConv2dOp(OpTest): ...@@ -107,7 +110,7 @@ class TestConv2dOp(OpTest):
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, place,
...@@ -121,7 +124,7 @@ class TestConv2dOp(OpTest): ...@@ -121,7 +124,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, ['Input'], place, ['Input'],
...@@ -138,7 +141,7 @@ class TestConv2dOp(OpTest): ...@@ -138,7 +141,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.use_cudnn: if self.testcudnn():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
self.check_grad_with_place( self.check_grad_with_place(
place, ['Filter'], place, ['Filter'],
......
...@@ -115,18 +115,18 @@ class TestLookupTableWIsSelectedRows(OpTest): ...@@ -115,18 +115,18 @@ class TestLookupTableWIsSelectedRows(OpTest):
w_array = np.ones((len(rows), row_numel)).astype("float32") w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)): for i in range(len(rows)):
w_array[i] *= i w_array[i] *= i
ids_tensor = w_selected_rows.get_tensor() w_tensor = w_selected_rows.get_tensor()
ids_tensor.set(w_array, place) w_tensor.set(w_array, place)
# create Out Variable # create Out Variable
Out_tensor = scope.var('Out').get_tensor() out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator # create and run lookup_table operator
lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
lookup_table.run(scope, place) lookup_table.run(scope, place)
# get result from Out # get result from Out
result_array = np.array(Out_tensor) result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty) # all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array): for idx, row in enumerate(ids_array):
assert (row[0] == result_array[idx]).all() assert (row[0] == result_array[idx]).all()
......
...@@ -21,13 +21,17 @@ import paddle.dataset.mnist as mnist ...@@ -21,13 +21,17 @@ import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16 import paddle.dataset.wmt16 as wmt16
def simple_fc_net(): def simple_fc_net(use_feed):
reader = fluid.layers.open_recordio_file( if use_feed:
filename='./mnist.recordio', img = fluid.layers.data(name='image', shape=[784], dtype='float32')
shapes=[[-1, 784], [-1, 1]], label = fluid.layers.data(name='label', shape=[1], dtype='int64')
lod_levels=[0, 0], else:
dtypes=['float32', 'int64']) reader = fluid.layers.open_recordio_file(
img, label = fluid.layers.read_file(reader) filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in xrange(4): for _ in xrange(4):
hidden = fluid.layers.fc( hidden = fluid.layers.fc(
...@@ -42,13 +46,18 @@ def simple_fc_net(): ...@@ -42,13 +46,18 @@ def simple_fc_net():
return loss return loss
def fc_with_batchnorm(): def fc_with_batchnorm(use_feed):
reader = fluid.layers.open_recordio_file( if use_feed:
filename='./mnist.recordio', img = fluid.layers.data(name='image', shape=[784], dtype='float32')
shapes=[[-1, 784], [-1, 1]], label = fluid.layers.data(name='label', shape=[1], dtype='int64')
lod_levels=[0, 0], else:
dtypes=['float32', 'int64']) reader = fluid.layers.open_recordio_file(
img, label = fluid.layers.read_file(reader) filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in xrange(1): for _ in xrange(1):
hidden = fluid.layers.fc( hidden = fluid.layers.fc(
...@@ -135,7 +144,9 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): ...@@ -135,7 +144,9 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu') return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt152Small(batch_size=2): def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant( img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant( label = fluid.layers.fill_constant(
...@@ -150,9 +161,9 @@ def SE_ResNeXt152Small(batch_size=2): ...@@ -150,9 +161,9 @@ def SE_ResNeXt152Small(batch_size=2):
conv = fluid.layers.pool2d( conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 64 cardinality = 32
reduction_ratio = 16 reduction_ratio = 16
depth = [3, 8, 36, 3] depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024] num_filters = [128, 256, 512, 1024]
for block in range(len(depth)): for block in range(len(depth)):
...@@ -185,30 +196,28 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -185,30 +196,28 @@ class TestParallelExecutorBase(unittest.TestCase):
memory_opt=True, memory_opt=True,
iter=10, iter=10,
batch_size=None, batch_size=None,
allow_op_delay=False): allow_op_delay=False,
feed_dict={}):
main = fluid.Program() main = fluid.Program()
startup = fluid.Program() startup = fluid.Program()
with fluid.program_guard(main, startup): with fluid.program_guard(main, startup):
loss = method() loss = method(use_feed=len(feed_dict) > 0)
adam = fluid.optimizer.Adam() adam = fluid.optimizer.Adam()
adam.minimize(loss) adam.minimize(loss)
if memory_opt: if memory_opt:
fluid.memory_optimize(main) fluid.memory_optimize(main)
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
loss_name=loss.name,
use_cuda=True,
allow_op_delay=allow_op_delay)
if batch_size is not None: if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count() batch_size *= fluid.core.get_cuda_device_count()
begin = time.time() begin = time.time()
first_loss, = exe.run([loss.name]) first_loss, = exe.run([loss.name], feed_dict=feed_dict)
first_loss = numpy.array(first_loss) first_loss = numpy.array(first_loss)
for i in xrange(iter): for i in xrange(iter):
exe.run([]) exe.run([], feed_dict=feed_dict)
last_loss, = exe.run([loss.name]) last_loss, = exe.run([loss.name], feed_dict=feed_dict)
end = time.time() end = time.time()
if batch_size is not None: if batch_size is not None:
...@@ -242,9 +251,19 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -242,9 +251,19 @@ class TestMNIST(TestParallelExecutorBase):
self.check_network_convergence(simple_fc_net) self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True) self.check_network_convergence(simple_fc_net, allow_op_delay=True)
img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
simple_fc_net, feed_dict={"image": img,
"label": label})
def test_batchnorm_fc(self): def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm) self.check_network_convergence(fc_with_batchnorm)
self.check_network_convergence(fc_with_batchnorm, allow_op_delay=True) img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm, feed_dict={"image": img,
"label": label})
class TestResnet(TestParallelExecutorBase): class TestResnet(TestParallelExecutorBase):
...@@ -271,7 +290,7 @@ class TestResnet(TestParallelExecutorBase): ...@@ -271,7 +290,7 @@ class TestResnet(TestParallelExecutorBase):
batch_size = 2 batch_size = 2
self.check_network_convergence( self.check_network_convergence(
functools.partial( functools.partial(
SE_ResNeXt152Small, batch_size=batch_size), SE_ResNeXt50Small, batch_size=batch_size),
iter=20, iter=20,
batch_size=batch_size) batch_size=batch_size)
...@@ -400,7 +419,8 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): ...@@ -400,7 +419,8 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
import transformer_model import transformer_model
def transformer(): def transformer(use_feed):
assert not use_feed, "transfomer doesn't support feed yet"
return transformer_model.transformer( return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1, ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
......
...@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest): ...@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest):
self.attrs = { self.attrs = {
'min_sizes': self.min_sizes, 'min_sizes': self.min_sizes,
'max_sizes': self.max_sizes,
'aspect_ratios': self.aspect_ratios, 'aspect_ratios': self.aspect_ratios,
'variances': self.variances, 'variances': self.variances,
'flip': self.flip, 'flip': self.flip,
...@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest): ...@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest):
'step_h': self.step_h, 'step_h': self.step_h,
'offset': self.offset 'offset': self.offset
} }
if len(self.max_sizes) > 0:
self.attrs['max_sizes'] = self.max_sizes
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self):
return
def setUp(self): def setUp(self):
self.op_type = "prior_box" self.op_type = "prior_box"
self.set_data() self.set_data()
def set_max_sizes(self):
max_sizes = [5, 10]
self.max_sizes = np.array(max_sizes).astype('float32').tolist()
def init_test_params(self): def init_test_params(self):
self.layer_w = 4 self.layer_w = 32
self.layer_h = 4 self.layer_h = 32
self.image_w = 20 self.image_w = 40
self.image_h = 20 self.image_h = 40
self.step_w = float(self.image_w) / float(self.layer_w) self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h) self.step_h = float(self.image_h) / float(self.layer_h)
@@ -66,8 +68,7 @@ class TestPriorBoxOp(OpTest):
        self.min_sizes = [2, 4]
        self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
-        self.max_sizes = [5, 10]
-        self.max_sizes = np.array(self.max_sizes).astype('float32').tolist()
+        self.set_max_sizes()
        self.aspect_ratios = [2.0, 3.0]
        self.flip = True
        self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
@@ -79,7 +80,7 @@ class TestPriorBoxOp(OpTest):
        self.clip = True
        self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
-        if len(self.max_sizes) > 1:
+        if len(self.max_sizes) > 0:
            self.num_priors += len(self.max_sizes)
        self.offset = 0.5
@@ -105,35 +106,27 @@ class TestPriorBoxOp(OpTest):
                idx = 0
                for s in range(len(self.min_sizes)):
                    min_size = self.min_sizes[s]
-                    c_w = c_h = min_size / 2.
-                    out_boxes[h, w, idx, :] = [
-                        (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h,
-                        (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h
-                    ]
-                    idx += 1
-                    if len(self.max_sizes) > 0:
-                        max_size = self.max_sizes[s]
-                        # second prior: aspect_ratio = 1,
-                        c_w = c_h = math.sqrt(min_size * max_size) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
-                    # rest of priors
-                    for r in range(len(self.real_aspect_ratios)):
-                        ar = self.real_aspect_ratios[r]
-                        if math.fabs(ar - 1.) < 1e-6:
-                            continue
-                        c_w = min_size * math.sqrt(ar) / 2
-                        c_h = (min_size / math.sqrt(ar)) / 2
-                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
-                                                   (c_y - c_h) / self.image_h,
-                                                   (c_x + c_w) / self.image_w,
-                                                   (c_y + c_h) / self.image_h]
-                        idx += 1
+                    # rest of priors
+                    for r in range(len(self.real_aspect_ratios)):
+                        ar = self.real_aspect_ratios[r]
+                        c_w = min_size * math.sqrt(ar) / 2
+                        c_h = (min_size / math.sqrt(ar)) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
+                    if len(self.max_sizes) > 0:
+                        max_size = self.max_sizes[s]
+                        # second prior: aspect_ratio = 1,
+                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
        # clip the prior's coordinate such that it is within [0, 1]
        if self.clip:
            out_boxes = np.clip(out_boxes, 0.0, 1.0)
@@ -144,5 +137,10 @@ class TestPriorBoxOp(OpTest):
        self.out_var = out_var.astype('float32')

+class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+    def set_max_sizes(self):
+        self.max_sizes = []

if __name__ == '__main__':
    unittest.main()
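The box geometry the reordered loop computes is compact enough to check standalone. A minimal sketch of the same math with illustrative sizes: each aspect ratio ar yields a prior whose half-width scales by sqrt(ar) and half-height by 1/sqrt(ar), and a non-empty max_sizes list adds one square prior whose side interpolates between the min and max size:

import math

min_size, max_size = 2.0, 5.0
aspect_ratios = [1.0, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]  # now includes ar == 1

# one prior per aspect ratio (half-extents, before normalizing by image size)
for ar in aspect_ratios:
    c_w = min_size * math.sqrt(ar) / 2
    c_h = (min_size / math.sqrt(ar)) / 2
    print("ar=%.2f -> half extents (%.3f, %.3f)" % (ar, c_w, c_h))

# the extra square prior used when max_sizes is non-empty
c_w = c_h = math.sqrt(min_size * max_size) / 2
print("max-size prior -> half extents (%.3f, %.3f)" % (c_w, c_h))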
@@ -19,9 +19,9 @@ from paddle.fluid.framework import Program
class TestOpDesc(unittest.TestCase):
    def test_op_desc(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
        self.assertIsNotNone(block)
        op = block.append_op()
        self.assertIsNotNone(op)
@@ -67,7 +67,7 @@ class TestOpDesc(unittest.TestCase):
        self.assertEqual(8, len(op.attr_names()))
-        op.set_block_attr("block_attr", prog.block(0))
+        op.set_block_attr("block_attr", program_desc.block(0))
        self.assertEqual(0, op.block_attr("block_attr"))
        mul_op = block.append_op()
@@ -88,20 +88,20 @@ class TestProgramDesc(unittest.TestCase):
        del program_desc

    def test_append_block(self):
-        prog_desc = core.ProgramDesc()
-        self.assertIsNotNone(prog_desc)
-        block_root = prog_desc.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block_root = program_desc.block(0)
        self.assertIsNotNone(block_root)
        self.assertEqual(block_root.id, 0)
-        block1 = prog_desc.append_block(block_root)
-        block2 = prog_desc.append_block(block1)
+        block1 = program_desc.append_block(block_root)
+        block2 = program_desc.append_block(block1)
        self.assertIsNotNone(block1)
        self.assertEqual(block1.id, block2.parent)
        self.assertEqual(block_root.id, block1.parent)
-        block3 = prog_desc.append_block(block_root)
+        block3 = program_desc.append_block(block_root)
        self.assertEqual(block3.parent, block_root.id)
-        self.assertEqual(prog_desc.block(1).id, 1)
-        self.assertEqual(4, prog_desc.num_blocks())
+        self.assertEqual(program_desc.block(1).id, 1)
+        self.assertEqual(4, program_desc.num_blocks())

class TestVarDesc(unittest.TestCase):
@@ -162,9 +162,9 @@ class TestVarDesc(unittest.TestCase):
class TestBlockDesc(unittest.TestCase):
    def test_add_var(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
        self.assertIsNotNone(block)
        var1 = block.var("var1")
        var2 = block.var("var2")
@@ -175,9 +175,9 @@ class TestBlockDesc(unittest.TestCase):
        self.assertEqual(var2_re, var2)

    def test_add_op(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
        self.assertIsNotNone(block)
        op1 = block.append_op()
        op2 = block.append_op()
@@ -189,9 +189,9 @@ class TestBlockDesc(unittest.TestCase):
    def test_remove_op(self):
        program = Program()
-        prog = program.desc
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = program.desc
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
        self.assertIsNotNone(block)
        op0 = block.append_op()
......
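The renames above are mechanical (prog/prog_desc to program_desc), but the block hierarchy the assertions walk is worth seeing in isolation. A minimal sketch using the same `core` API the test exercises, assuming a Paddle build where `paddle.fluid.core` is importable:

import paddle.fluid.core as core

program_desc = core.ProgramDesc()
root = program_desc.block(0)              # block 0 always exists
child = program_desc.append_block(root)   # e.g. a loop or cond body
assert child.parent == root.id            # blocks record their parent's id
assert program_desc.num_blocks() == 2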
@@ -15,8 +15,8 @@
import unittest
import paddle.fluid as fluid
-import paddle
-import paddle.dataset.mnist as mnist
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist

class TestRecordIO(unittest.TestCase):
......
@@ -97,5 +97,72 @@ class TestSparseSGDOp(unittest.TestCase):
        self.check_with_place(place)
+class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+        row_width = 12
+        # create and initialize Grad Variable
+        grad_height = 10
+        grad_rows = [0, 4, 7]
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(grad_height)
+        grad_selected_rows.set_rows(grad_rows)
+        grad_array = np.ones((len(grad_rows), row_width)).astype("float32")
+        grad_array[0, 0] = 2.0
+        grad_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(grad_array, place)
+
+        # create and initialize Param Variable
+        param_rows = [0, 1, 2, 3, 4, 5, 6, 7]
+
+        # init Param
+        w_selected_rows = scope.var('Param').get_selected_rows()
+        w_selected_rows.set_height(len(param_rows))
+        w_selected_rows.set_rows(param_rows)
+        w_array = np.ones((len(param_rows), row_width)).astype("float32")
+        for i in range(len(param_rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        w_before_optimize = np.array(w_tensor)
+
+        # create and initialize LearningRate Variable
+        lr_value = 0.1
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), lr_value).astype("float32")
+        lr.set(lr_array, place)
+
+        # optimize with Python
+        w_after_optimize = np.copy(w_before_optimize)
+        for index, id in enumerate(grad_rows):
+            w_after_optimize[id] = w_before_optimize[
+                id] - lr_value * grad_array[index]
+
+        # create and run sgd operator
+        sgd_op = Operator(
+            "sgd",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        sgd_op.run(scope, place)
+
+        # get and compare result
+        result_array = np.array(w_tensor)
+        assert (result_array == w_after_optimize).all()
+
+    def test_sparse_parameter_sgd(self):
+        places = [core.CPUPlace()]
+        # do not support GPU kernel currently
+        for place in places:
+            self.check_with_place(place)
if __name__ == "__main__":
    unittest.main()
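The Python-side update in the new test is the whole contract of sparse SGD on selected rows: only the rows named by the gradient move, each by the learning rate times its gradient row. Reduced to a standalone NumPy sketch with the same shapes and values as the test:

import numpy as np

row_width = 12
grad_rows = [0, 4, 7]                          # rows present in the sparse grad
grad = np.ones((len(grad_rows), row_width), dtype='float32')
w = np.stack([np.full(row_width, float(i), dtype='float32') for i in range(8)])
lr = 0.1

for index, row_id in enumerate(grad_rows):
    w[row_id] -= lr * grad[index]              # untouched rows keep their values

assert (w[1] == 1.0).all()                     # row 1 received no gradient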
@@ -68,6 +68,17 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp):
        self.use_cudnn = True
+class TestSoftmaxFP16Op(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
    def init_kernel_type(self):
        self.use_cudnn = True
......
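The atol=1e-3 in the new FP16 test reflects half-precision rounding: the op runs in float16 but is checked against a float32 reference. The comparison pattern, as a self-contained NumPy sketch with a numerically stabilized softmax and the same tolerance:

import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=1, keepdims=True))  # subtract max for stability
    return e / e.sum(axis=1, keepdims=True)

x = np.random.rand(4, 10).astype('float16')
reference = softmax(x.astype('float32'))          # float32 ground truth
half = softmax(x).astype('float32')               # computed in float16
assert np.allclose(half, reference, atol=1e-3)    # tolerance matches the test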
#################### test_config_parser #########################
add_test(NAME layers_test
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)

add_test(NAME test_reset_hook
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)

add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
add_test(NAME test_layerHelpers
-  COMMAND
-        ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
+        ${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
        ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
)
@@ -2,7 +2,6 @@
set -e
cd `dirname $0`
-export PYTHONPATH=$PWD/../../../../
protostr=$PWD/protostr
. file_list.sh
......
@@ -58,7 +58,7 @@ def mkl():
        'istaged': ISTAGED,
        'with_mkl': '@WITH_MKL@'})

-write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
+write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')

packages=['paddle',
@@ -107,9 +107,10 @@ package_dir={
    # So that package points to other directory.
    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
}

if '${WITH_FLUID_ONLY}'== 'OFF':
-    package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+    package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'

paddle_rt_lib_dir = 'lib'
......