diff --git a/.gitignore b/.gitignore
index 2badc3bdaa52f2608183fa34393719be66630654..9e3a0b499f9f42856429f3a42bef313ea3df3699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
# clion workspace.
cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index df3f0c7f0c31efaa127515bb98e5668b8f9df199..796bcf28a1dfb308ccb7a2f839742c5c2fcf2002 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf..8f7a3bf8eeaef75c8840f4ea318b484d33249bb7 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappystream to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..a631ad14b18310598f7eea3a51839d61a9e456ff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 20b8506e678af4db6ccb65bef99d28e085a67bf2..c3d73235453c8c9fd2859c3ab142888e8bda2dbe 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
ExternalProject_Add(
extern_zlib
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 3fe750f47efc149bb1af6086841bffd5dd8e85fd..e8bc285bdc95e213b9da2ee388078349a46d2798 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -251,7 +251,7 @@ function(cc_test TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction(cc_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+ COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction()
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 9fe79323ef9377a459d8405cfa74c88c52ce9346..8086507bb4b7e870ad6d6091945ed07a00b5100b 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index ca40dfb9644cea69329be0ec231378506c138bc0..48b396f0786adad1ba6cd41f72497f853e54bc38 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index f627437f354a12c79cad25c959409db29ecbd874..b123b756e2251c38f319e1aefa2cb04fd7a36b03 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -9,5 +9,5 @@
use_eigen_cn.md
name_convention.md
support_new_device.md
- releasing_process.md
+ releasing_process_cn.md
op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 0b65fed67ad45eb399b624184485a99a082d79e9..98988fc22dcedecdbcd67fb3bf761377bf046337 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -9,5 +9,5 @@ Development
use_eigen_en.md
name_convention.md
support_new_device.md
- releasing_process.md
+ releasing_process_en.md
op_markdown_format.md
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process_cn.md
similarity index 74%
rename from doc/fluid/dev/releasing_process.md
rename to doc/fluid/dev/releasing_process_cn.md
index c5943ccd81c2ae2aaacd2676da12509db889f54a..4c6728fba7150b0f1e180e57590f18a5b677c70d 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -10,19 +10,10 @@ Each new PaddlePaddle release follows this process:
* Use the Regression Test List as a checklist to verify the correctness of this release.
* If any check fails, record all the failing cases, fix every bug on this `release/[version]` branch, bump the patch number, and go back to step 2.
* Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
- * Build this version's Python wheel packages and publish them to pypi.
- * Since pypi.python.org now enforces the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform suffix of the wheel must be renamed before uploading with twine, e.g. from `linux_x86_64` to `manylinux1_x86_64`.
- * The packages on pypi are named paddlepaddle and paddlepaddle_gpu. To upload a GPU package, change name to "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel: `python setup.py bdist_wheel`.
- * To upload:
- ```
- cd build/python
- pip install twine
- twine upload dist/[package to upload]
- ```
- * Build this version's Docker release images and publish them to DockerHub. If this fails, fix the Docker image build, bump the patch number, and go back to step 2.
-1. After step 3, merge the `release/[version]` branch into master, tag the merge commit on master with the version number, merge `master` back into `develop`, and finally delete the `release/[version]` branch.
-1. Collaborate on writing the Release Note.
-
+ * Publish this version's Python wheel packages to pypi.
+ * Update the Docker images (see the detailed steps below).
+1. After step 3, merge the `release/[version]` branch into master, tag the merge commit on master with the version number, and merge `master` back into `develop`.
+1. Collaborate on writing the Release Note.
Note that:
@@ -31,13 +22,18 @@ Each new PaddlePaddle release follows this process:
## Publishing wheel packages to pypi
-Use the [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+1. Use the [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to run the automated binary builds. As shown in the figure below, select the versions to release (usually one CPU build and one GPU build), then click the "..." button to the right of "run"; this
-opens the dialog below. In the second tab (Changes), choose the branch to release, here 0.11.0, and click the "Run Build" button. After the build finishes,
-you can find the 3 generated binaries, for the C-API, `cp27m`, and `cp27mu` builds, in the "Artifacts" drop-down on this page. Then upload them
-with the `twine` tool as described above.
-
-
+opens the dialog below. In the second tab (Changes), choose the branch to release, here 0.11.0, and click the "Run Build" button.
+
+1. After the build finishes, you can find the 3 generated binaries, for the C-API, `cp27m`, and `cp27mu` builds, in the "Artifacts" drop-down on this page.
+1. Since pypi.python.org now enforces the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform suffix of the wheel must be renamed before uploading with twine, e.g. from `linux_x86_64` to `manylinux1_x86_64`.
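+For example, assuming the build produced a wheel named `paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl` (the actual file name depends on the build), the rename looks like:
+```
+cd build/python/dist
+mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
+   paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
+```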
+1. Upload:
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
* Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment to support more Linux
distributions; the same images can be used for manual builds, and can also be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ .
@@ -48,10 +44,20 @@ Each new PaddlePaddle release follows this process:
After the PaddlePaddle CI wheel builds above finish, the Docker images are pushed to DockerHub automatically, so publishing a Docker image only requires tagging the automatically pushed image
with the corresponding version number:
-1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check that the latest tag was updated after the wheel builds above finished.
-1. Run `docker pull paddlepaddle/paddle:[latest tag]`, where the latest tag can be latest, latest-gpu, etc.
-1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. Run `docker push paddlepaddle/paddle:[version]`
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
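+
+For example, for the CPU image of release 0.11.0 (the full list of tags to update follows below):
+
+```
+docker pull paddlepaddle/paddle:latest
+docker tag paddlepaddle/paddle:latest paddlepaddle/paddle:0.11.0
+docker push paddlepaddle/paddle:0.11.0
+```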
+
+The image tags that need updating are:
+
+* `[version]`: the CPU version
+* `[version]-openblas`: the openblas version
+* `[version]-gpu`: the GPU version (CUDA 8.0, cuDNN 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: images for other CUDA and cuDNN versions
+
+Afterwards, visit https://hub.docker.com/r/paddlepaddle/paddle/tags/ to check whether the release succeeded.
## PaddlePaddle branching conventions
@@ -76,7 +82,7 @@ PaddlePaddle development uses [git-flow](http://nvie.com/posts/a-successful-git-
### All chapters of the PaddlePaddle Book
-Every PaddlePaddle release must first guarantee that all chapters of the PaddlePaddle Book work correctly. Functional correctness means verifying model training both with the current `paddle_trainer` and with pure-`Python` training.
+Every PaddlePaddle release must first guarantee that all chapters of the PaddlePaddle Book work correctly. Functional correctness means verifying model training both with the current `paddle_trainer` and with pure-`Python` training (V2 and Fluid).
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,210 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches with the "git-flow branching model" and uses [Semantic Versioning](http://semver.org/) for its version numbers.
+
+Each time we release a new PaddlePaddle version, we should follow the steps below:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch named like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, the second `0.10.0rc2`, and so on.
+1. After that, we should:
+   * Run all regression tests on the Regression Test List (see PaddlePaddle TeamCity CI) to confirm
+     that this release has no major bugs.
+   * If a regression test fails, we must fix the bugs and create a new `release/[version]`
+     branch from the previous release branch.
+   * Modify `python/setup.py.in`: change the version number and set `ISTAGED` to `True`.
+   * Publish the PaddlePaddle release wheel packages to pypi (see the instructions below for details).
+   * Update the Docker images (see the instructions below for details).
+1. After the above steps, merge the `release/[version]` branch into master, push a tag on the master commit,
+   then merge `master` into `develop`.
+1. Update the Release Note.
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from the develop branch into release branches; a release branch should contain
+  features only for the current release, so that we can test that exact version.
+* If we want to fix bugs on a release branch, we must merge the fix into the master, develop, and release branches.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all the wheel packages that need to be published. As shown in the following picture, choose a build
+   version, click the "..." button on the right side of the "Run" button, switch to the second tab in the
+   pop-up box, choose the current release branch, and click the "Run Build" button. You may repeat this
+   step to start builds of different versions.
+
+1. After the build succeeds, download the outputs under "Artifacts", including the C-API, `cp27m`, and `cp27mu` packages.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), we need to rename the
+   package from `linux_x86_64` to `manylinux1_x86_64` before uploading it with
+   `twine`.
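+   One way to do the rename, assuming the build produced a wheel named
+   `paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl` (the actual file name
+   depends on the build):
+   ```
+   cd build/python/dist
+   mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
+      paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
+   ```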
+1. Start the upload:
+ ```
+ cd build/python
+ pip install twine
+ twine upload dist/[package to upload]
+ ```
+
+* NOTE: We use a special Docker image to build our releases so that they support more Linux distributions; you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ or build it using the
+  scripts under `tools/manylinux1`.
+* pypi does not allow overwriting an already-uploaded version of a wheel package, even if you delete the
+  old version; you must bump the version number before uploading a new one.
+
+## Publish Docker Images
+
+Our CI tool pushes the latest images to DockerHub, so we only need to push a version tag, like so:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+
+* `[version]`: CPU-only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version image (CUDA 8.0, cuDNN 5)
+* `[version]-gpu-[cudaver]-[cudnnver]`: tags for other CUDA and cuDNN versions
+
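+For example, to publish version 0.11.0 of the CPU-only image:
+
+```
+docker pull paddlepaddle/paddle:latest
+docker tag paddlepaddle/paddle:latest paddlepaddle/paddle:0.11.0
+docker push paddlepaddle/paddle:0.11.0
+```
+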
+You can then check the newly pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* The `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* The `develop` branch is for development. Each commit on the develop branch has passed CI unit tests, but no
+  regression tests have been run.
+* `release/[version]` branches are used to publish each release. The latest release branches receive
+  bug fixes only for that version, never feature updates.
+* Developer forks are not required to follow the
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model; every branch in a fork acts like a feature branch.
+  * Advice: use the fork's develop branch to sync up with the main repo's develop branch.
+  * Advice: fork new feature branches from the fork's develop branch.
+  * Use that branch on the developer's fork to create pull requests and start reviews.
+  * The developer can push new commits to that branch while the pull request is open.
+* Bug fixes also start from the developer's forked repo, and bug-fix branches can be merged into
+  `master`, `develop`, and the `release` branches.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all chapters of the PaddlePaddle Book run correctly, including
+V1 (`paddle_trainer`) training, V2 training, and Fluid training.
+
+
+|  | Linear Regression | Recognize Digits | Image Classification | Word2Vec | Personalized Recommendation | Sentiment Analysis | Semantic Role Labeling | Machine Translation |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 260b6c9fd1b364433cae098bacea77aa7fe9e266..76b82fd97f1ed642696c4414676b694ebda9ad81 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e5757b86b43001bc6090d8edd0aaa5ff4fc476ee..5aa5c1381fa3fad4ebc181c7868da03ae0138016 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 82de7a3a3e1ca7724e1eda877d53454a4fa4129a..be957d37b14c618e9346251b3bd3dbaf1541773f 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index da1eafc02ed8cd155d4f0f1fbadcb7b237b6fcc1..2670a21a227546ffcee4f10f395feef3c58df9b4 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index cf84568ecdf1227b0d0ed3606a4a9a6e5186af72..06e1f5d5f0884efabfcdf917ca5c35d94ad5dce9 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
${START_END}
)
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
DEPENDS _swig_paddle
)
# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 761aeb5b174105edece8880a9f5012c13a63fd11..13cb79129cc2272d215cdb475fb146b37266699e 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,3 +1,8 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+ COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
+
py_test(testTrain SRCS testTrain.py)
py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
diff --git a/paddle/fluid/framework/.clang-format b/paddle/fluid/.clang-format
similarity index 100%
rename from paddle/fluid/framework/.clang-format
rename to paddle/fluid/.clang-format
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index c425c71160a8fa3830a5fbdae1baaed850710877..a473ed7400012b7d0cbc5ab9bed263b3cca8c6ec 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
- COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+ COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 468423e0e8e7b8c9ebc14b7568c9c3bd21645ea7..873969b2a884f6d9e133fe87bf72725c36ce8b98 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include <deque>
#include <memory>
#include <set>
+#include <string>
#include <unordered_map>
#include <vector>
@@ -96,6 +97,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
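+ // Remove the variable with the given name from this block's variable map.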
+ void RemoveVar(const std::string &name) { vars_.erase(name); }
+
std::vector<OpDesc *> AllOps() const;
size_t OpSize() const { return ops_.size(); }
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 019bea600f496a6b58579ad0aa8af836cd6134a9..722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include <stddef.h> // for size_t
-#include <condition_variable>
+#include <stddef.h> // for size_t
+#include <condition_variable> // NOLINT
#include <typeindex>
#include "paddle/fluid/platform/enforce.h"
@@ -216,7 +216,8 @@ class ChannelHolder {
template <typename T>
struct PlaceholderImpl : public Placeholder {
- PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+ explicit PlaceholderImpl(size_t buffer_size)
+ : type_(std::type_index(typeid(T))) {
channel_.reset(MakeChannel<T>(buffer_size));
}
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index e056779ea0dd0a31191b628f82724298efaf50ff..26d454534e1ae38c4f83376c0836a45781ea9101 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
-#include <condition_variable>
+#include <condition_variable> // NOLINT
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
- ChannelImpl(size_t);
+ explicit ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback;
- QueueMessage(T *item)
+ explicit QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel {
}
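+ // Walk the queue from the front and return the first message that either
+ // has no Select callback or whose callback accepts this action; messages
+ // whose callback declines were claimed by another Select case and dropped.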
std::shared_ptr<QueueMessage> get_first_message(
- std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
- while (!queue.empty()) {
+ std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+ while (!queue->empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
- std::shared_ptr<QueueMessage> m = queue.front();
- queue.pop_front();
+ std::shared_ptr<QueueMessage> m = queue->front();
+ queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) {
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m =
- get_first_message(recvq, ChannelAction::SEND);
+ get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) {
*(m->data) = std::move(*item);
@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) {
// buffer and move front of send queue to the buffer
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m =
- get_first_message(sendq, ChannelAction::RECEIVE);
+ get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) {
// Case 1 : Channel is Buffered
// Do Data transfer from front of buffer
@@ -219,8 +219,9 @@ bool ChannelImpl<T>::Receive(T *item) {
if (m != nullptr) {
*item = std::move(*(m->data));
m->Notify();
- } else
+ } else {
return recv_return(Receive(item));
+ }
}
return recv_return(true);
}
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 1184bfdae1940286fb72d9091ae4f23ff7f84a54..542d791f6bbdf7d68a4786998ccc0233fff6473d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h"
-#include <chrono>
-#include <thread>
+#include <chrono> // NOLINT
+#include <thread> // NOLINT
#include "gtest/gtest.h"
using paddle::framework::Channel;
@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
- if (i < buffer_size)
+ if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations
- else {
+ } else {
bool is_exception = false;
try {
ch->Send(&i);
@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
}
void ChannelCloseUnblocksReceiversTest(Channel *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// At least 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, at least 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that at least 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// At least 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count the number of successful sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) {
- const int num_threads = 15;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const int kNumThreads = 15;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to send data to channel.
- for (size_t i = 0; i < num_threads / 3; i++) {
+ for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *ended) {
@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to receive data to channel.
- for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+ for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to close the channel.
- for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+ for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
EXPECT_TRUE(ch->IsClosed());
// delete the channel
delete ch;
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index bf1a705ef50b663efa53393ead1f81fd6bcf8c48..89b5c6847f15b3f2a270fe1e7db9e590549e8982 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,6 +16,6 @@ else()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index dee505fee0dccd8d60bb290a8bec4df243e504a2..4f130d265900483ec7a7c541f2610d17a352913f 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
+ // Split LoDTensor and copy to each place specified in places.
std::vector<LoDTensor> SplitLoDTensor(
const std::vector<platform::Place> places) const;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f6a43804ef2fd73c4a2c2c3b3dfbb90bff1c451b..a3b4a8c0829ae3324e933309b2eaea35fe571997 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
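+// Infer the data type of a variable: the element type of its LoDTensor, or
+// of its SelectedRows' value tensor; any other variable type is an error.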
+proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
+  if (var->IsType<framework::LoDTensor>()) {
+    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return framework::ToDataType(
+        var->Get<framework::SelectedRows>().value().type());
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+}
+
static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 41214b41cb68cbd7049552f39195ae5257e0d06f..b7a7c69b4c8493f945926c75797c49d327a3197e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
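+// Returns the data type of the LoDTensor or SelectedRows held by the variable.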
+proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+
class OperatorBase;
class ExecutionContext;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 17885143247f0e0db8f12931e3c3412e7114ef3d..7be93fa6002ae93c3e1b75c8f7fe5ca5f40b271f 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -150,13 +150,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
-void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
-                           const std::string &fetched_var_name) {
+void ParallelExecutor::Run(
+    const std::vector<std::string> &fetch_tensors,
+    const std::string &fetched_var_name,
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0);
+ SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
}
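+// Split every feed LoDTensor across the places and share one slice with each
+// local scope, so that every device runs on its own part of the input batch.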
+void ParallelExecutor::SplitTensorToPlaces(
+    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
+  for (auto it : feed_tensors) {
+    auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+    for (size_t j = 0; j < member_->places_.size(); ++j) {
+      // TODO(panxy0718): Do I need to delete this var?
+      member_->local_scopes_[j]
+          ->Var(it.first)
+          ->GetMutable<LoDTensor>()
+          ->ShareDataWith(lod_tensors[j]);
+    }
+  }
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 964b476234e622cae934d41bc3793bc3114a5f1a..c7c58b2b808383621a6d492f9188b0d36bfa6858 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,9 +42,13 @@ class ParallelExecutor {
bool allow_op_delay);
void Run(const std::vector<std::string>& fetch_tensors,
- const std::string& fetched_var_name = "fetched_var");
+ const std::string& fetched_var_name,
+ const std::unordered_map<std::string, LoDTensor>& feed_tensors);
private:
+ void SplitTensorToPlaces(
+ const std::unordered_map<std::string, LoDTensor>& feed_tensors);
+
ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 504344e937dfdc362cdc22298a5f963d87011e9d..d9d6b7dd67f1c6e4bbd6a4e1a8f0843d4cb93c05 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
+
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 9458d56a01df432aea573d796456b9be31350038..8e2d9470d3954e0f66c74828a8d8292c2875a8f4 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,6 +50,15 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
+  /**
+   * Get the index of the given id in rows (linear search); enforces that
+   * the id is present.
+   */
+  int64_t index(int64_t id) const {
+    auto it = std::find(rows_.begin(), rows_.end(), id);
+    PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 7a4839044008338dda43f75b5ee6def500b78270..f49d1a47a325b2aac6185073203df124be18b54d 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -128,13 +128,20 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
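+  // Pick the allocation that matches the requested place: pageable host
+  // memory for CPUPlace, device memory for CUDAPlace, and page-locked
+  // (pinned) host memory for CUDAPinnedPlace.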
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
- } else if (platform::is_gpu_place(place)) {
+ } else if (platform::is_gpu_place(place) ||
+ platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
- PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+ PADDLE_THROW(
+ "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
- holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
- boost::get<platform::CUDAPlace>(place), size, type));
+ if (platform::is_gpu_place(place)) {
+ holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+ boost::get<platform::CUDAPlace>(place), size, type));
+ } else if (platform::is_cuda_pinned_place(place)) {
+ holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+ boost::get<platform::CUDAPinnedPlace>(place), size, type));
+ }
}
#endif
offset_ = 0;
@@ -145,7 +152,7 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
- "Cannot invoke mutable data if current hold nothing");
+ "Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type());
}
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
index 78996908b18a5a0935d8de9920e8ccef9069e74b..f6c6a1fec13d8b12efd1fa71a7a93316e484d045 100644
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -35,24 +35,25 @@ class Tuple {
public:
using ElementVars = std::vector<ElementVar>;
- Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
+ Tuple(const std::vector<ElementVar>& var,
+ const std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {}
- Tuple(std::vector<ElementVar>& var) : var_(var) {}
+ explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
- ElementVar get(int idx) const { return var_[idx]; };
+ ElementVar get(int idx) const { return var_[idx]; }
- ElementVar& get(int idx) { return var_[idx]; };
+ ElementVar& get(int idx) { return var_[idx]; }
- bool isSameType(Tuple& t) const;
+ bool isSameType(const Tuple& t) const;
- size_t getSize() const { return var_.size(); };
+ size_t getSize() const { return var_.size(); }
private:
ElementVars var_;
std::vector<VarDesc> var_desc_;
};
-bool Tuple::isSameType(Tuple& t) const {
+bool Tuple::isSameType(const Tuple& t) const {
size_t tuple_size = getSize();
if (tuple_size != t.getSize()) {
return false;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 52e9c0baa64508f82d0a86a88c8c5f8d23f9f7f2..a5b62ef322bfad0fc956d7d722797bd5add6aea6 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false;
}
-void LoadPersistables(framework::Executor& executor,
- framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
}
std::unique_ptr<framework::ProgramDesc> Load(
- framework::Executor& executor,
- framework::Scope& scope,
- const std::string& prog_filename,
- const std::string& param_filename) {
+ framework::Executor& executor, framework::Scope& scope,
+ const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 6817a6fca047c9336233697a7bee4e5e16eedd5e..d07d315b93ef10a464080899b1cb9920abe83be3 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
-void LoadPersistables(framework::Executor& executor,
- framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index e7ffb00ec8d8926193fe510ebdb7185f75c90906..6ed77adb9d891c75e7de358d0d7a0c06c9af96dd 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
+ set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 9ab808efec3abdb86724fb16725962958c5cf55c..3e77dc166c355bc141563eda4705ca8d75226ac4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
// The second dim of the input tensor should be 13
// The input data should be >= 0
int64_t batch_size = 10;
- SetupTensor<float>(
- input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
+ SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
+ static_cast<float>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index 9126efb8c2ee5e38bc84f65c8ba7ad5401894268..46419f70a7e3d7571e0a8750cc8e51fa82946d9e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0].
- SetupTensor<float>(input,
- {FLAGS_batch_size, 3, 32, 32},
- static_cast<float>(0),
- static_cast<float>(1));
+ SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
+ static_cast<float>(0), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
@@ -48,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---";
- TestInference<paddle::platform::CPUPlace>(
- dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
+ TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds,
+ cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
@@ -59,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---";
- TestInference<paddle::platform::CUDAPlace>(
- dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
+ TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds,
+ cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims();
CheckError(output1, output2);
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 184924016634bba26204d937744ca5fa87cd443c..84bb855fea5fa397ff71e2c922fea3302951b7ca 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2;
- SetupLoDTensor(word,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(predicate,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
static_cast<int64_t>(predicate_dict_len - 1));
- SetupLoDTensor(ctx_n2,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(ctx_n1,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(ctx_0,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(ctx_p1,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(ctx_p2,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
- SetupLoDTensor(mark,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
static_cast<int64_t>(mark_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
index 1fb0f9e77797cf6e61e918700763ee33a495cb96..f12828a2685305c20d26492dbf04fa9ddacf9317 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
paddle::framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [-1.0, 1.0].
- SetupTensor<float>(input,
- {FLAGS_batch_size, 1, 28, 28},
- static_cast<float>(-1),
- static_cast<float>(1));
+ SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
+ static_cast<float>(-1), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
- TestInference<paddle::platform::CPUPlace>(
- dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+ TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
+ FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
- TestInference<paddle::platform::CUDAPlace>(
- dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+ TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
+ FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
CheckError(output1, output2);
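The extra is_combined argument threaded through these calls reflects the
two layouts in which a fluid inference model can be saved: the program with
all parameters concatenated into one file, or the program with one file per
parameter. The file names below are illustrative assumptions, not taken
from this diff:

#include <string>

// Hypothetical helper for illustration only; the real loading logic lives
// in paddle/fluid/inference/io.h and its file names may differ.
std::string ProgramPath(const std::string& dirname, bool is_combined) {
  return is_combined ? dirname + "/__model_combined__"  // assumed name
                     : dirname + "/__model__";          // assumed name
}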
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
index b42a33c9a90b5feafaed343a197da0e4db11b7ea..70aa6b194d4417fc85384cc3f615089f024f928e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
// Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1};
- SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+ SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1};
- SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+ SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0};
- SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+ SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10};
- SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+ SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783};
- SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+ SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9};
- SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+ SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
- SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+ SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id);
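Unlike the earlier tests, these call sites fill tensors from explicit data
rather than random values. A minimal sketch of the two overloads under the
same pointer-parameter assumption:

#include <cstring>
#include <vector>

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: copy caller-supplied values into the tensor.
template <typename T>
void SetupTensor(paddle::framework::LoDTensor* input,
                 paddle::framework::DDim dims, const std::vector<T>& data) {
  T* ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
  std::memcpy(ptr, data.data(), data.size() * sizeof(T));
}

// Sketch only: as above, but also attach sequence boundaries (LoD).
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor* input,
                    paddle::framework::DDim dims,
                    const paddle::framework::LoD& lod,
                    const std::vector<T>& data) {
  input->set_lod(lod);
  SetupTensor<T>(input, dims, data);
}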
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
index a0523905bd1631cd8768b1601e459cb9d110a84d..e15c3f59acb1eac535120554a3799c37e9d4e951 100644
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}};
- SetupLoDTensor(
- word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
- SetupLoDTensor(
- trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+ SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
+ static_cast<int64_t>(1));
+ SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
+ static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data);
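The LoD ("level of detail") argument encodes sequence boundaries as
cumulative row offsets, so the lod{{0, 4, 10}} above describes one batch
holding two sequences: rows [0, 4) and rows [4, 10) of the tensor. A small
worked example:

#include <cstddef>

#include "paddle/fluid/framework/lod_tensor.h"

void LoDExample() {
  paddle::framework::LoD lod{{0, 4, 10}};
  std::size_t num_sequences = lod[0].size() - 1;   // 2 sequences in the batch
  std::size_t len_first = lod[0][1] - lod[0][0];   // first sequence: 4 steps
  std::size_t len_second = lod[0][2] - lod[0][1];  // second sequence: 6 steps
  (void)num_sequences; (void)len_first; (void)len_second;
}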
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index 824b3274ebc7ba046e61798b3f61ef9924a75679..0dbb6a30405eb64133613052ad57b1f705a9e7b4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoD lod{{0, 4, 10}};
int64_t word_dict_len = 5147;
- SetupLoDTensor(words,
- lod,
- static_cast<int64_t>(0),
+ SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
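Every binary in this directory reads its model path from the gflags flag
declared by DEFINE_string(dirname, ...). A typical main that wires gflags
parsing into gtest looks like the sketch below; Paddle links its own
paddle_gtest_main, which may differ:

#include "gflags/gflags.h"
#include "gtest/gtest.h"

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  // Consume --dirname, --batch_size, --repeat, etc. before tests run.
  google::ParseCommandLineFlags(&argc, &argv, true);
  return RUN_ALL_TESTS();
}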
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index 1481760c529c29a7290f476e2a22e1ded5ab7787..c9328eb21b4fdb06c5f65ba0f7337b1e79fa1927 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include <gtest/gtest.h>
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
- SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
- SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
- SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
- SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+ SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+ SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+ SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+ SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index d559cc7d038b4313f8824e207cc69a6173941790..eebab888e3dd8c7df2c50ac5070b62a47a92c723 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#pragma once
+
+#include