Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into multigpumultinode

d2760bda · typhoonzero · 063868fb · 535646cf · d2760bda · d2760bda
272 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
 # clion workspace.
 cmake-build-*
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL           "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")

--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
 add_dependencies(snappystream extern_snappystream)
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
 )
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 ExternalProject_Add(
    extern_zlib

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -251,7 +251,7 @@ function(cc_test TARGET_NAME)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
 endfunction(cc_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS ENVS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
  endif()
 endfunction()

--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h
 Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
-![](./images/asgd.gif)
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
+</p>
 We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.

--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
@@ -114,13 +114,13 @@ current thread under two conditions:
 #### Channel Send
 <p align="center">
-<img src="./images/channel_send.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
 </p>
 #### Channel Receive
 <p align="center">
-<img src="./images/channel_recv.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
 </p>
 ## Limitations and Considerations

--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -23,21 +23,25 @@ The following table compares concepts in Fluid and Go
 <td>user-defined functions </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+<td></td>
 </tr>
 <tr>
 <td>control-flow and built-in functions </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
+<td></td>
 </tr>
 <tr>
 <td>goroutines, channels </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
+<td></td>
 </tr>
 <tr>
 <td>runtime </td>
 <td>
 <a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
+<td></td>
 </tr>
 </tbody>
 </table>

--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -254,7 +254,7 @@ only one case will be executed.
 ### select_op flow
 <p align="center">
-<img src="./images/select_op_workflow.png"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
 </p>
 The select algorithm is inspired by golang's select routine.  Please refer to

--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
 Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-<img src="src/compiler.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>
 PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-<img src="src/paddle-compile.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>
 The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-<img src="src/distributed_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>
 The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
@@ -152,7 +152,7 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.
-<img src="src/remote_executor.png" width="500" align="center" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-<img src="src/local_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>
 ### Training Data

--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
 ## Transpiler
-<img src="src/multi-threads/single-thread@3x.png" width="300">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">
 After converted:
-<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
 ## Implement

--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
-<img src="src/local-graph.png" width="300"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>
 After converting:
-<img src="src/dist-graph.png" width="700"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>
 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
-<img src="src/sparse_update.png" width="700" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
 ### Benefits
 - Model parallelism becomes easier to implement: it is an extension to

--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
 ## RNN Algorithm Implementation
 <p align="center">
-<img src="./rnn.jpg"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
 </p>
 The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
 There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
 <p align="center">
-<img src="./rnn.png"/><br/>
+<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn.png"/><br/>
 Figure 2 illustrates the RNN's data flow
 </p>
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
 The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
 <p align="center">
-<img src="./2_level_rnn.png"/>
+<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/2_level_rnn.png"/>
 </p>
 ```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
 <p align="center">
-<img src="./rnn_2level_data.png"/>
+<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn_2level_data.png"/>
 </p>
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
 The following graph showes the training computational process of `batch_norm_op`:
-<img src="../images/batch_norm_op_kernel.png" width="800"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
 cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
 `is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
 <div align=center>
-<img src="../images/batch_norm_fork.png" width="500"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
 </div>
 Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.

--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -6,17 +6,17 @@ A central problem in machine learning is how to design an algorithm that will pe
 ### Parameter Norm Penalties
 Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
-<img src="./images/loss_equation.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
 The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
 The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
 ##### L2 Regularization:
-<img src="./images/l2_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
 ##### L1 Regularization
-<img src="./images/l1_regularization.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
 A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
@@ -40,11 +40,11 @@ The idea of building ops for regularization is in sync with the refactored Paddl
 Below is an example of a really simple feed forward neural network.
-<img src="./images/feed_forward.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
 The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
 ### Python API implementation for Regularization
@@ -64,9 +64,3 @@ Since we want to create the regularization ops in a lazy manner, the regularizat
 #### High-level API
 In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -116,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
 - **One** CTC-loss layer
 <div align="center">
-<img src="images/ds2_network.png" width=350><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
 Figure 1. Archetecture of Deep Speech 2 Network.
 </div>
@@ -208,7 +208,7 @@ TODO by Assignees
 ### Beam Search with CTC and LM
 <div align="center">
-<img src="images/beam_search.png" width=600><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
 Figure 2. Algorithm for CTC Beam Search Decoder.
 </div>

--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i
 ## LoD and shape changes during decoding
 <p align="center">
-  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
 </p>
 According to the image above, the only phase that changes the LoD is beam search.

--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
@@ -7,14 +7,14 @@ It applies several important concepts in machine learning system design, includi
 In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
 <p align="center">
-<img src="./test.dot.png" width = "35%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
 Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
 </p>
 The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
 <p align="center">
-<img src="./dcgan.png" width = "90%" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
 Figure 2. Photo borrowed from the original DC-GAN paper.
 </p>

--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -37,7 +37,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。然后按照上述的方法
 使用`twine`工具上传即可。
-<img src="ci_build_whl.png">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
 * 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
  发型版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。

--- a/doc/fluid/howto/performance/profiler.md
+++ b/doc/fluid/howto/performance/profiler.md
@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program?  There many C++ A
 The overall flow is shown as the following figure.
-<img src="./images/profiler.png" align="center"/><br/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/profiler.png" align="center"/><br/>
 ### Event

--- a/doc/fluid/images/2_level_rnn.dot
+++ b/doc/fluid/images/2_level_rnn.dot
+digraph G {
+  rnn [label="1st level RNN" shape=box]
+  subgraph cluster0 {
+    label = "time step 0"
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+    rnn1 [label="2nd level RNN" shape=box]
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+  subgraph cluster1 {
+    label = "time step 1"
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+    rnn2 [label="2nd level RNN" shape=box]
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+  subgraph cluster2 {
+    label = "time step 2"
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+    rnn3 [label="2nd level RNN" shape=box]
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
--- a/doc/fluid/images/2_level_rnn.png
+++ b/doc/fluid/images/2_level_rnn.png
--- a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
+++ b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
--- a/doc/fluid/images/asgd.gif
+++ b/doc/fluid/images/asgd.gif
--- a/doc/fluid/images/batch_norm_fork.dot
+++ b/doc/fluid/images/batch_norm_fork.dot
+digraph ImageBatchNormForkGragh {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+    label="forked";
+  }
+}
--- a/doc/fluid/images/batch_norm_fork.png
+++ b/doc/fluid/images/batch_norm_fork.png
--- a/doc/fluid/images/batch_norm_op_kernel.png
+++ b/doc/fluid/images/batch_norm_op_kernel.png
--- a/doc/fluid/images/beam_search.png
+++ b/doc/fluid/images/beam_search.png
--- a/doc/fluid/images/ci_build_whl.png
+++ b/doc/fluid/images/ci_build_whl.png
--- a/doc/fluid/images/compiler.graffle
+++ b/doc/fluid/images/compiler.graffle
--- a/doc/fluid/images/compiler.png
+++ b/doc/fluid/images/compiler.png
--- a/doc/fluid/images/control_flow_graph.png
+++ b/doc/fluid/images/control_flow_graph.png
--- a/doc/fluid/images/dataflow_equations.png
+++ b/doc/fluid/images/dataflow_equations.png
--- a/doc/fluid/images/dcgan.png
+++ b/doc/fluid/images/dcgan.png
--- a/doc/fluid/images/deep_learning.png
+++ b/doc/fluid/images/deep_learning.png
--- a/doc/fluid/images/dist-graph.graffle
+++ b/doc/fluid/images/dist-graph.graffle
--- a/doc/fluid/images/dist-graph.png
+++ b/doc/fluid/images/dist-graph.png
--- a/doc/fluid/images/distributed_architecture.graffle
+++ b/doc/fluid/images/distributed_architecture.graffle
--- a/doc/fluid/images/distributed_architecture.png
+++ b/doc/fluid/images/distributed_architecture.png
--- a/doc/fluid/images/ds2_network.png
+++ b/doc/fluid/images/ds2_network.png
--- a/doc/fluid/images/feed_forward.png
+++ b/doc/fluid/images/feed_forward.png
--- a/doc/fluid/images/feed_forward_regularized.png
+++ b/doc/fluid/images/feed_forward_regularized.png
--- a/doc/fluid/images/fluid-compiler.graffle
+++ b/doc/fluid/images/fluid-compiler.graffle
--- a/doc/fluid/images/fluid-compiler.png
+++ b/doc/fluid/images/fluid-compiler.png
--- a/doc/fluid/images/graph_construction_example.bash
+++ b/doc/fluid/images/graph_construction_example.bash
+cat ./graph_construction_example.dot | \
+    sed 's/color=red/color=red, style=invis/g' | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_only.png
+cat ./graph_construction_example.dot | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+cat ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_all.png
--- a/doc/fluid/images/graph_construction_example.dot
+++ b/doc/fluid/images/graph_construction_example.dot
+digraph ImageClassificationGraph {
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle];
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red];
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+        ////////// The optimizaiton part //////////
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green];
+        d_b -> OPT_b -> b [color=green];
+        ////////// Groupings //////////
+        subgraph clusterMSE {
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+        subgraph clusterFC {
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
--- a/doc/fluid/images/graph_construction_example_all.png
+++ b/doc/fluid/images/graph_construction_example_all.png
--- a/doc/fluid/images/graph_construction_example_forward_backward.png
+++ b/doc/fluid/images/graph_construction_example_forward_backward.png
--- a/doc/fluid/images/graph_construction_example_forward_only.png
+++ b/doc/fluid/images/graph_construction_example_forward_only.png
--- a/doc/fluid/images/l1_regularization.png
+++ b/doc/fluid/images/l1_regularization.png
--- a/doc/fluid/images/l2_regularization.png
+++ b/doc/fluid/images/l2_regularization.png
--- a/doc/fluid/images/local-graph.graffle
+++ b/doc/fluid/images/local-graph.graffle
--- a/doc/fluid/images/local-graph.png
+++ b/doc/fluid/images/local-graph.png
--- a/doc/fluid/images/local_architecture.graffle
+++ b/doc/fluid/images/local_architecture.graffle
--- a/doc/fluid/images/local_architecture.png
+++ b/doc/fluid/images/local_architecture.png
--- a/doc/fluid/images/lookup_table.png
+++ b/doc/fluid/images/lookup_table.png
--- a/doc/fluid/images/lookup_table_training.png
+++ b/doc/fluid/images/lookup_table_training.png
--- a/doc/fluid/images/loss_equation.png
+++ b/doc/fluid/images/loss_equation.png
--- a/doc/fluid/images/multi-threads.graffle
+++ b/doc/fluid/images/multi-threads.graffle
--- a/doc/fluid/images/multi-threads@3x.png
+++ b/doc/fluid/images/multi-threads@3x.png
--- a/doc/fluid/images/multigpu_allreduce.graffle
+++ b/doc/fluid/images/multigpu_allreduce.graffle
--- a/doc/fluid/images/multigpu_allreduce.png
+++ b/doc/fluid/images/multigpu_allreduce.png
--- a/doc/fluid/images/multigpu_before_convert.graffle
+++ b/doc/fluid/images/multigpu_before_convert.graffle
--- a/doc/fluid/images/multigpu_before_convert.png
+++ b/doc/fluid/images/multigpu_before_convert.png
--- a/doc/fluid/images/multiple_reader.png
+++ b/doc/fluid/images/multiple_reader.png
--- a/doc/fluid/images/paddle-compile.graffle
+++ b/doc/fluid/images/paddle-compile.graffle
--- a/doc/fluid/images/paddle-compile.png
+++ b/doc/fluid/images/paddle-compile.png
--- a/doc/fluid/images/pprof_1.png
+++ b/doc/fluid/images/pprof_1.png
--- a/doc/fluid/images/pprof_2.png
+++ b/doc/fluid/images/pprof_2.png
--- a/doc/fluid/images/profiler.png
+++ b/doc/fluid/images/profiler.png
--- a/doc/fluid/images/readers.png
+++ b/doc/fluid/images/readers.png
--- a/doc/fluid/images/remote_executor.graffle
+++ b/doc/fluid/images/remote_executor.graffle
--- a/doc/fluid/images/remote_executor.png
+++ b/doc/fluid/images/remote_executor.png
--- a/doc/fluid/images/rnn.dot
+++ b/doc/fluid/images/rnn.dot
+digraph G {
+  label = "simple RNN implementation" 
+  ranksep=2;
+  //graph [nodesep=1, ranksep=1];
+  node[nodesep=1]
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+}
--- a/doc/fluid/images/rnn.jpg
+++ b/doc/fluid/images/rnn.jpg
--- a/doc/fluid/images/rnn.png
+++ b/doc/fluid/images/rnn.png
--- a/doc/fluid/images/rnn_2level_data.dot
+++ b/doc/fluid/images/rnn_2level_data.dot
+digraph G {
+  chapter [label="chapter"]
+  subgraph cluster0 {
+    label = "paragraph 0"
+    top_rnn0[label="top rnn step 0" shape=box]
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+  subgraph cluster1{
+    label = "paragraph 1"
+    top_rnn1[label="top rnn step 1" shape=box]
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+  subgraph cluster_p0 {
+    label = "sentence 0"
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+}
--- a/doc/fluid/images/rnn_2level_data.png
+++ b/doc/fluid/images/rnn_2level_data.png
--- a/doc/fluid/images/single-thread@3x.png
+++ b/doc/fluid/images/single-thread@3x.png
--- a/doc/fluid/images/sparse_update.graffle
+++ b/doc/fluid/images/sparse_update.graffle
--- a/doc/fluid/images/sparse_update.png
+++ b/doc/fluid/images/sparse_update.png
--- a/doc/fluid/images/test.dot
+++ b/doc/fluid/images/test.dot
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+    D_f -> g_loss;
+    label2 -> g_loss;
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+    d_loss [color=red];
+    g_loss [color=green];
+}
--- a/doc/fluid/images/test.dot.png
+++ b/doc/fluid/images/test.dot.png
--- a/doc/fluid/images/theta_star.gif
+++ b/doc/fluid/images/theta_star.gif
--- a/doc/fluid/images/timeline.jpeg
+++ b/doc/fluid/images/timeline.jpeg
--- a/doc/fluid/images/tracing.jpeg
+++ b/doc/fluid/images/tracing.jpeg
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle

--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
 add_subdirectory(api)
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
                  ${CMAKE_CURRENT_SOURCE_DIR}
                  ${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
 # Recurrent Group Tutorial
-TBD
+## Overview
+Sequential data is common in natural language processing.
+A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words.
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
+Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
+Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: <a href = "hierarchical_layer_en.html">Layers for supporting double-layer sequences as input.</a>
+## Related Concepts
+### Basic Principle 
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+``` python 
+recurrent_group(step, input, reverse) 
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+### Input Example
+Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice.
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
+- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401).
+In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation.
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
+## Sequence-level RNN Introduction
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+- Word-level RNN:  each state corresponds to a word.
+- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence.
+For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
+## Usage of Sequence-level RNN
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+- **Single-input Single-output**: Both input and output are single layer sequences.
+  - If there are multiple inputs, the number of words in different input sequences must be exactly equal.
+  - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
+  - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+  - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false.
+- **Double-input Double-output**: Both input and output are two-level sequence.
+  - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal.
+  - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default.
+  - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent.
+  - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
+- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now".
+### Usage of Generation Process
+Using `beam_search` need follow those conventions: 
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
    ${START_END}
 )
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
    DEPENDS _swig_paddle
 )
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
 if(WITH_TESTING)
    IF(NOT PY_PIP_FOUND)

--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
 py_test(testTrain SRCS testTrain.py)
 py_test(testMatrix SRCS testMatrix.py)
 py_test(testVector SRCS testVector.py)

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t groups);
+/**
+ * @brief   Upsample forward.
+ * @param[in]   inputData   input data.
+ * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
+ * @param[out]  batchSize   the batch size of the input.
+ * @param[in]   imgSizeH    image height.
+ * @param[in]   imgSizeW    image width.
+ * @param[in]   channels    the input channels.
+ * @param[in]   outputH     the output height.
+ * @param[in]   outputW     the output widht.
+ * @param[out]  outputData  output data.
+ */
+extern void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData);
+/**
+ * @brief   Upsample backward.
+ * @param[in]   outputGradData  the output grad data.
+ * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
+ * @param[out]  batchSize       the batch size of the input.
+ * @param[in]   imgSizeH        image height.
+ * @param[in]   imgSizeW        image width.
+ * @param[in]   channels        the input channels.
+ * @param[in]   outputH         the output height.
+ * @param[in]   outputW         the output widht.
+ * @param[out]  inputGradData   the input grad data.
+ */
+extern void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData);
 #endif  // HL_CNN_H_
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad,
                               size_t featLen,
                               size_t group) {}
+inline void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData) {}
+inline void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData) {}
 #endif  // HL_CNN_STUB_H_
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad,
      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
  CHECK_SYNC("hl_maxout_backward failed");
 }
+__global__ void upsampleForwardCompute(real* input_data,
+                                       real* mask_data,
+                                       size_t nthreads,
+                                       size_t in_h,
+                                       size_t in_w,
+                                       size_t out_h,
+                                       size_t out_w,
+                                       real* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int offset = index / (in_w * in_h) * out_h * out_w;
+    int upsample_idx = static_cast<int>(mask_data[index]);
+    output_data[offset + upsample_idx] = input_data[index];
+  }
+}
+__global__ void upsampleBackwardCompute(real* out_grad,
+                                        real* mask_data,
+                                        size_t nthreads,
+                                        size_t in_h,
+                                        size_t in_w,
+                                        size_t out_h,
+                                        size_t out_w,
+                                        real* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int offset = index / (in_w * in_h) * out_h * out_w;
+    int upsample_idx = static_cast<int>(mask_data[index]);
+    input_grad[index] = out_grad[offset + upsample_idx];
+  }
+}
+void hl_upsample_forward(real* inputData,
+                         real* maskData,
+                         size_t batchSize,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t channels,
+                         size_t outputH,
+                         size_t outputW,
+                         real* outputData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
+                                                              maskData,
+                                                              num_kernels,
+                                                              imgSizeH,
+                                                              imgSizeW,
+                                                              outputH,
+                                                              outputW,
+                                                              outputData);
+  CHECK_SYNC("hl_upsample_forward failed");
+}
+void hl_upsample_backward(real* outputGradData,
+                          real* maskData,
+                          size_t batchSize,
+                          size_t imgSizeH,
+                          size_t imgSizeW,
+                          size_t channels,
+                          size_t outputH,
+                          size_t outputW,
+                          real* inputGradData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
+                                                               maskData,
+                                                               num_kernels,
+                                                               imgSizeH,
+                                                               imgSizeW,
+                                                               outputH,
+                                                               outputW,
+                                                               inputGradData);
+  CHECK_SYNC("hl_upsample_backward failed");
+}
--- a/paddle/fluid/framework/.clang-format
+++ b/paddle/fluid/framework/.clang-format
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
--- a/paddle/fluid/memory/.clang-format
+++ b/paddle/fluid/memory/.clang-format
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
--- a/paddle/fluid/memory/memory.cc
+++ b/paddle/fluid/memory/memory.cc
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
--- a/paddle/fluid/memory/memory_test.cc
+++ b/paddle/fluid/memory/memory_test.cc
--- a/paddle/fluid/memory/pinned_memory_test.cu
+++ b/paddle/fluid/memory/pinned_memory_test.cu
--- a/paddle/fluid/operators/.clang-format
+++ b/paddle/fluid/operators/.clang-format
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
--- a/paddle/fluid/operators/detail/bytebuffer_stream.cc
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ b/paddle/fluid/operators/detail/bytebuffer_stream.h
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ b/paddle/fluid/operators/detail/grpc_client.h
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ b/paddle/fluid/operators/detail/grpc_server.h
--- a/paddle/fluid/operators/detail/grpc_server_test.cc
+++ b/paddle/fluid/operators/detail/grpc_server_test.cc
--- a/paddle/fluid/operators/detail/proto_encoder_helper.h
+++ b/paddle/fluid/operators/detail/proto_encoder_helper.h
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.h
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
--- a/paddle/fluid/operators/detail/simple_block_queue.h
+++ b/paddle/fluid/operators/detail/simple_block_queue.h
--- a/paddle/fluid/operators/detail/variable_response.cc
+++ b/paddle/fluid/operators/detail/variable_response.cc
--- a/paddle/fluid/operators/detail/variable_response.h
+++ b/paddle/fluid/operators/detail/variable_response.h
--- a/paddle/fluid/operators/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
--- a/paddle/fluid/operators/prior_box_op.cu
+++ b/paddle/fluid/operators/prior_box_op.cu
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
--- a/paddle/fluid/platform/.clang-format
+++ b/paddle/fluid/platform/.clang-format
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
--- a/paddle/fluid/pybind/.clang-format
+++ b/paddle/fluid/pybind/.clang-format
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
--- a/paddle/fluid/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
--- a/paddle/fluid/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
--- a/paddle/fluid/pybind/recordio.h
+++ b/paddle/fluid/pybind/recordio.h
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
--- a/paddle/fluid/pybind/tensor_py_test.cc
+++ b/paddle/fluid/pybind/tensor_py_test.cc
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
--- a/paddle/fluid/recordio/header.h
+++ b/paddle/fluid/recordio/header.h
--- a/paddle/fluid/recordio/header_test.cc
+++ b/paddle/fluid/recordio/header_test.cc
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
--- a/paddle/fluid/recordio/writer.cc
+++ b/paddle/fluid/recordio/writer.cc
--- a/paddle/fluid/recordio/writer.h
+++ b/paddle/fluid/recordio/writer.h
--- a/paddle/fluid/recordio/writer_scanner_test.cc
+++ b/paddle/fluid/recordio/writer_scanner_test.cc
--- a/paddle/fluid/string/.clang-format
+++ b/paddle/fluid/string/.clang-format
--- a/paddle/fluid/string/piece.cc
+++ b/paddle/fluid/string/piece.cc
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/fluid/string/printf_test.cc
--- a/paddle/fluid/string/to_string_test.cc
+++ b/paddle/fluid/string/to_string_test.cc
--- a/paddle/gserver/layers/UpsampleLayer.cpp
+++ b/paddle/gserver/layers/UpsampleLayer.cpp
--- a/paddle/gserver/layers/UpsampleLayer.h
+++ b/paddle/gserver/layers/UpsampleLayer.h
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/test_Upsample.cpp
+++ b/paddle/gserver/tests/test_Upsample.cpp
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/distributed_spliter.py
+++ b/python/paddle/fluid/distributed_spliter.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
--- a/python/setup.py.in
+++ b/python/setup.py.in