diff --git a/.gitignore b/.gitignore
index 2badc3bdaa52f2608183fa34393719be66630654..9e3a0b499f9f42856429f3a42bef313ea3df3699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
# clion workspace.
cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index df3f0c7f0c31efaa127515bb98e5668b8f9df199..796bcf28a1dfb308ccb7a2f839742c5c2fcf2002 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf..8f7a3bf8eeaef75c8840f4ea318b484d33249bb7 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
+
add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..a631ad14b18310598f7eea3a51839d61a9e456ff 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -62,7 +62,8 @@ ExternalProject_Add(
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 20b8506e678af4db6ccb65bef99d28e085a67bf2..c3d73235453c8c9fd2859c3ab142888e8bda2dbe 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
ExternalProject_Add(
extern_zlib
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 3fe750f47efc149bb1af6086841bffd5dd8e85fd..e8bc285bdc95e213b9da2ee388078349a46d2798 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -251,7 +251,7 @@ function(cc_test TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction(cc_test)
@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+ COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
- WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction()
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
index 9fe79323ef9377a459d8405cfa74c88c52ce9346..8086507bb4b7e870ad6d6091945ed07a00b5100b 100644
--- a/doc/fluid/CMakeLists.txt
+++ b/doc/fluid/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_docs gen_proto_py)
+add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_fluid_docs_cn gen_proto_py)
+add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
index ca40dfb9644cea69329be0ec231378506c138bc0..48b396f0786adad1ba6cd41f72497f853e54bc38 100644
--- a/doc/fluid/api/CMakeLists.txt
+++ b/doc/fluid/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
index 53d601d3a9a37e8adad519833bb6fa2dc48023a0..940d37fb31dcd0c50ea6c4c42b052d7cb23a9c47 100644
--- a/doc/fluid/design/algorithm/parameter_average.md
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h
Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for 
. The averaging is done as follows:
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for 
. The averaging is done as follows:
-
+
+
+
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
index a00a3325e7b49381f0f82ebbf32b74683f02de5f..df67438bcc741ac521b00ee962fc13c93db21182 100644
--- a/doc/fluid/design/concurrent/channel.md
+++ b/doc/fluid/design/concurrent/channel.md
@@ -2,7 +2,7 @@
## Introduction
-A Channel is a data structure that allows for synchronous interprocess
+A Channel is a data structure that allows for synchronous interprocess
communication via message passing. It is a fundemental component of CSP
(communicating sequential processes), and allows for users to pass data
between threads without having to worry about synchronization.
@@ -18,7 +18,7 @@ Creates a new channel that takes in variables of a specific dtype.
- **fluid.make_channel(dtype, capacity=0)**
- **dtype**: The data type of variables being sent/received through channel
- - **capacity**: The capacity of the channel. A capacity of 0 represents
+ - **capacity**: The capacity of the channel. A capacity of 0 represents
an unbuffered channel. Capacity > 0 represents a buffered channel
```
@@ -40,8 +40,8 @@ fluid.channel_close(ch)
### Send data to a channel
-Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
-`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+Sends a variable to a channel. Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
`ChannelHolder` are supported.
By default, the data of the Variable is moved from the sender to the receiver,
@@ -52,7 +52,7 @@ however the user can optionally copy the data before performing the send.
- **variable**: The variable to send to the channel
- **is_copy**: If set to True, channel_send will perform a variable assign
to copy the source variable to a new variable to be sent.
-
+
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
@@ -68,7 +68,7 @@ receiving variable.
- **channel**: The channel to receive the variable from
- **return_variable**: The destination variable used to store the data of the
variable received from the channel
-
+
```
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
@@ -84,9 +84,9 @@ internal queues, locks, and conditional variables.
### QueueMessage
QueueMessage encapsulates the state of the channel send/receive operation to be
-put in the **sendq/recvq**. It contains a condition variable used to lock the
+put in the **sendq/recvq**. It contains a condition variable used to lock the
thread (when there are no available sends/receives). In addition, it contains
-a callback function to notify a thread when the QueueMessage is being
+a callback function to notify a thread when the QueueMessage is being
processed by the channel.
### Queues
@@ -108,21 +108,21 @@ channel_recv operation will put a new QueueMessage on the recvq and block the
current thread under two conditions:
1. The channel is buffered and there is no data on the buff_
2. The channel is unbuffered and does not have a sender
-
+
### State diagram
#### Channel Send
-
+
-
+
#### Channel Receive
-
+
-
+
## Limitations and Considerations
### Variable Copy
@@ -135,5 +135,5 @@ be sent before it is sent.
Please note that this is acheived by adding an **assign** operator and creating
a temporary variable that is sent in place of the original variable. Please
-note that **assign** operator has limited support for only certain variables
+note that **assign** operator has limited support for only certain variables
datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
index 64602166065af28309d7a01fdeb7076a9b0a081a..1859f983e9133674e69ecd506d7683ea926b2b8f 100644
--- a/doc/fluid/design/concurrent/concurrent_programming.md
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -23,21 +23,25 @@ The following table compares concepts in Fluid and Go
user-defined functions |
layers |
+ |
control-flow and built-in functions |
intrinsics/operators |
+ |
goroutines, channels |
class ThreadPool |
+ |
runtime |
class Executor |
+ |
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
index 52c226bc94a4e8bfc5588705d7f65328840e91cc..4fcae57cc7932cdaebe549486e7f7cebf0bd038a 100644
--- a/doc/fluid/design/concurrent/select_op.md
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -2,13 +2,13 @@
## Introduction
-In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
-statement lets a goroutine wait on multiple communication operations at the
-same time. The **select** blocks until one of its cases can run, then
-executes the case. If multiple cases are ready to run, then one case is
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case. If multiple cases are ready to run, then one case is
choosen at random to be executed.
-With the introduction of CSP for Paddle, we mimic this behavior by
+With the introduction of CSP for Paddle, we mimic this behavior by
creating a ***select_op***.
## How to use it
@@ -17,11 +17,11 @@ The **select_op** is available as a c++ operator. However most users
will prefer to use the much simplier Python API.
- **fluid.Select()**: Creates a select operator and adds it to the current
-block within the main program. Also creates a sub block and adds it to the
-main program. This sub block is used to hold all variables and operators
+block within the main program. Also creates a sub block and adds it to the
+main program. This sub block is used to hold all variables and operators
used by the case statements.
-
-Within the select block, users can add cases by
+
+Within the select block, users can add cases by
calling **select.case** or **select.default** method.
- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
@@ -37,13 +37,13 @@ execute.
```
ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
+
x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
-
+
while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
while_op = While(cond=while_cond)
-
+
with while_op.block():
with fluid.Select() as select:
with select.case(fluid.channel_send, channel, x):
@@ -99,17 +99,17 @@ blocks {
}
}
// Create "select" operator.
- // inputs:
+ // inputs:
// X: All input variables used by operators within the select block
// case_to_execute: Variable filled in by select_op when it determines
// which case to execute.
//
// outputs:
- // Out: All output variables referenced by operators within select block.
- //
+ // Out: All output variables referenced by operators within select block.
+ //
// attrs:
// sub_block: The block id containing the select "cases"
- // cases: Serialized list of all cases in the select op.
+ // cases: Serialized list of all cases in the select op.
// Each case is serialized as: ',,,'
// where type is 0 for default, 1 for send, and 2 for receive.
// No channel and values are needed for default cases.
@@ -150,7 +150,7 @@ into **X**. It will also create a temp variable called **case_to_execute**. Th
filled in by the select_op after it has completed processing the case statements.
If there are no available cases to execute (ie: all cases are blocked on channel operations, and
-there is no default statement), then the select_op will block the current thread. The thread will
+there is no default statement), then the select_op will block the current thread. The thread will
unblock once there is a channel operation affecting one of the case statements, at which point, the
**select_op** will set the **case_to_execute** variable to the index of the case to execute.
@@ -247,17 +247,17 @@ blocks {
```
-Cases are represented by a **conditional_block operator**, whose's condition is set as the output of
-equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
+Cases are represented by a **conditional_block operator**, whose's condition is set as the output of
+equal(**case_to_execute**, **case_index**). Since each case index is unique in this sub-block,
only one case will be executed.
### select_op flow
-
+
-The select algorithm is inspired by golang's select routine. Please refer to
+The select algorithm is inspired by golang's select routine. Please refer to
http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
## Backward Pass
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
index a405cb6aaf80b9d2e8a1a9c774ca85cc7e62bbab..229cb47c17d633be6848bb35e58d33ec9b47ec3b 100644
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-
+
PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-
+
The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-
+
The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
@@ -152,7 +152,7 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on
Cluster environment.
-
+
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-
+
### Training Data
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
index a8d8ee0422acc84835170a44eb83f9b5f0c6bb40..38222d083084ebfca3099ce96b47868c42d55101 100644
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
## Transpiler
-
+
After converted:
-
+
## Implement
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 6ce48dfbfce8b094684b412ebfda7e505ddc30ae..73c85da5e89eee0ac7857a0b808bc64ae673fdad 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
Below is an example of converting the user defined graph to the
subgraphs for the trainer and the parameter server:
-
+
After converting:
-
+
1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
non-zero gradient data. So when we do parameter optimization both locally and remotely,
we only need to send those non-zero rows to the optimizer operators:
-
-
+
### Benefits
- Model parallelism becomes easier to implement: it is an extension to
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
index 6f414e5549b149bc88fb252085ff56dbb06730f8..7b61b050f640814d6949cf6847b431da53d59581 100644
--- a/doc/fluid/design/dynamic_rnn/rnn.md
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
## RNN Algorithm Implementation
-
+
The above diagram shows an RNN unrolled into a full network.
@@ -22,7 +22,7 @@ There are several important concepts here:
There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
-
+
Figure 2 illustrates the RNN's data flow
@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
-
+
```python
@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
-
+
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
index d1392619c42d9206bf4bddcd33ad11b033e6cbdb..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37 100644
--- a/doc/fluid/design/modules/batch_norm_op.md
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -2,7 +2,7 @@
## What is batch normalization
-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training.
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training.
The principle of batch normalization can be summarized into a simple function:
@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
The following graph showes the training computational process of `batch_norm_op`:
-
+
cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
@@ -74,13 +74,13 @@ cudnn provides APIs to finish the whole series of computation, we can use them i
`batch_norm_op` is warpped as a layer in Python:
-```python
-def batch_norm_layer(net,
+```python
+def batch_norm_layer(net,
input,
- output,
- scale,
- bias,
- use_global_est = False,
+ output,
+ scale,
+ bias,
+ use_global_est = False,
epsilon = 1e-6,
momentum = 0.99):
mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
@@ -119,15 +119,15 @@ for pass_id in range(PASS_NUM):
if pass_id % 100 == 0:
net.infer(test_image) # run inferencing model
# ...
-```
+```
`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
-

+
-Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
index 21280ac898feb4dd5e5a5d9e88d121e856850f0b..8cd5ff71d193f03e1ac923724b52f28c6057d25d 100644
--- a/doc/fluid/design/modules/regularization.md
+++ b/doc/fluid/design/modules/regularization.md
@@ -6,23 +6,23 @@ A central problem in machine learning is how to design an algorithm that will pe
### Parameter Norm Penalties
Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
-
+
The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
##### L2 Regularization:
-
+
##### L1 Regularization
-
+
A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
## Regularization Survey
-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
## Proposal for Regularization in PaddlePaddle
@@ -32,41 +32,35 @@ In the new design, we propose to create new operations for regularization. For n
- L2_regularization_op
- L1_regularization_op
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
### Computation Graph
Below is an example of a really simple feed forward neural network.
-
+
The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-
+
### Python API implementation for Regularization
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
#### Creation of Regularization ops
There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph.
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
-The proposal is to add these ops in a lazy manner just before the backward pass.
+The proposal is to add these ops in a lazy manner just before the backward pass.
#### Storage of Regularization attributes
-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
-
-
-
-
-
-
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
index 7f5dcf55f9f2a0fd27ffde100510dd8fee305381..f32a5b7e8a4d820319a666dab4c3129360e2c924 100644
--- a/doc/fluid/design/network/deep_speech_2.md
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -116,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
- **One** CTC-loss layer
-

+

Figure 1. Archetecture of Deep Speech 2 Network.
@@ -142,7 +142,7 @@ Key ingredients about the layers:
- **Batch Normalization Layers**:
- Added to all above layers (except for data and loss layer).
- Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
-
+
@@ -208,7 +208,7 @@ TODO by Assignees
### Beam Search with CTC and LM
-

+

Figure 2. Algorithm for CTC Beam Search Decoder.
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
index c4a9bbeeefca0e05c335dd60233691e8bac33015..f13d30ca9fe09c9525c711436f605bb280e11000 100644
--- a/doc/fluid/design/network/sequence_decoder.md
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i
## LoD and shape changes during decoding
-
+
According to the image above, the only phase that changes the LoD is beam search.
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
index fb41df8615f73d9fd4c32995eab265833eac1a55..7167470088766985fa5ad31657410309330fd725 100644
--- a/doc/fluid/design/others/gan_api.md
+++ b/doc/fluid/design/others/gan_api.md
@@ -1,24 +1,24 @@
# Design for GAN
-GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
+GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
-
+
Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
-
+
Figure 2. Photo borrowed from the original DC-GAN paper.
-## The Conditional-GAN might be a class.
+## The Conditional-GAN might be a class.
This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:
- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
@@ -29,7 +29,7 @@ This design we adopt the popular open source design in https://github.com/carped
Returns a generated image.
- discriminator(image):
-Given an image, decide if it is from a real source or a fake one.
+Given an image, decide if it is from a real source or a fake one.
Returns a 0/1 binary label.
- build_model(self):
@@ -47,7 +47,7 @@ To be more detailed, we introduce our design of DCGAN as following:
```python
class DCGAN(object):
def __init__(self, y_dim=None):
-
+
# hyper parameters
self.y_dim = y_dim # conditional gan or not
self.batch_size = 100
@@ -82,18 +82,18 @@ class DCGAN(object):
# input z: the random noise
# input y: input data label (optional)
# output G_im: generated fake images
-
+
if not self.y_dim:
z = pd.layer.concat(1, [z, y])
-
+
G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
G_h0_bn = pd.layer.batch_norm(G_h0)
G_h0_relu = pd.layer.relu(G_h0_bn)
-
+
G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
G_h1_bn = pd.layer.batch_norm(G_h1)
G_h1_relu = pd.layer.relu(G_h1_bn)
-
+
G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
G_im = pd.layer.tanh(G_im)
return G_im
@@ -111,11 +111,11 @@ class DCGAN(object):
D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
D_h0_bn = pd.layer.batchnorm(h0)
D_h0_relu = pd.layer.lrelu(h0_bn)
-
+
D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
D_h1_bn = pd.layer.batchnorm(D_h1)
D_h1_relu = pd.layer.lrelu(D_h1_bn)
-
+
D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
return D_h2
```
@@ -123,7 +123,7 @@ class DCGAN(object):
### Class member function: Build the model
- Define data readers as placeholders to hold the data;
- Build generator and discriminators;
-- Define two training losses for discriminator and generator, respectively.
+- Define two training losses for discriminator and generator, respectively.
If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
```python
class DCGAN(object):
@@ -133,7 +133,7 @@ class DCGAN(object):
self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
self.z = pd.data(tf.float32, [None, self.z_size])
-
+
# step 1: generate images by generator, classify real/fake images with discriminator
if self.y_dim: # if conditional GAN, includes label
self.G = self.generator(self.z, self.y)
@@ -147,12 +147,12 @@ class DCGAN(object):
# generate fake images
self.sampled = self.sampler(self.z)
self.D_f = self.discriminator(self.images)
-
+
# step 2: define the two losses
self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
self.d_loss = self.d_loss_real + self.d_loss_fake
-
+
self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
```
@@ -176,7 +176,7 @@ class DCGAN(object):
self.G = self.generator(self.z)
self.D_g = self.discriminator(self.G, self.y)
self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
-
+
with pd.default_block().d_block():
if self.y_dim: # if conditional GAN, includes label
self.D_t = self.discriminator(self.images, self.y)
@@ -217,7 +217,7 @@ if __name__ == "__main__":
# load mnist data
data_X, data_y = self.load_mnist()
-
+
# Two subgraphs required!!!
with pd.block().d_block():
d_optim = pd.train.Adam(lr = .001, beta= .1)
@@ -228,7 +228,7 @@ if __name__ == "__main__":
# executor
sess = pd.executor()
-
+
# training
for epoch in xrange(10000):
for batch_id in range(N / batch_size):
@@ -239,7 +239,7 @@ if __name__ == "__main__":
batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
if batch_id % 2 == 0:
- sess.run(d_step,
+ sess.run(d_step,
feed_dict = {dcgan.images: batch_im,
dcgan.y: batch_label,
dcgan.z: batch_z})
diff --git a/doc/fluid/dev/releasing_process.md b/doc/fluid/dev/releasing_process.md
index 0810765b85f73d9dba876e66fb43bb1ad476d6d2..c5943ccd81c2ae2aaacd2676da12509db889f54a 100644
--- a/doc/fluid/dev/releasing_process.md
+++ b/doc/fluid/dev/releasing_process.md
@@ -37,7 +37,7 @@ PaddlePaddle每次发新的版本,遵循以下流程:
可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
使用`twine`工具上传即可。
-
+
* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
index b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20..ee96e7c74ce317caddb387cbb1d4998937bd5c81 100644
--- a/doc/fluid/howto/performance/profiler.md
+++ b/doc/fluid/howto/performance/profiler.md
@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program? There many C++ A
The overall flow is shown as the following figure.
-
+
### Event
@@ -36,10 +36,10 @@ enum EventKind {
kPopRange};
```
- kMark: only a marker without time range.
-- kPushRange: mark the starting event for time range.
+- kPushRange: mark the starting event for time range.
- kPopRange: mark the ending event for time range.
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece.
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event lists are used to record each piece.
```c++
class Event {
@@ -66,11 +66,11 @@ struct EventList {
};
```
-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
```c++
enum ProfilerState {
- kDisabled,
+ kDisabled,
kCPU,
kCUDA
};
diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+ rnn [label="1st level RNN" shape=box]
+
+ subgraph cluster0 {
+ label = "time step 0"
+
+ sent0 [label="sentence"]
+ sent1 [label="sentence"]
+
+ rnn1 [label="2nd level RNN" shape=box]
+
+ sent0 -> rnn1
+ sent1 -> rnn1
+ }
+
+ subgraph cluster1 {
+ label = "time step 1"
+
+ sent2 [label="sentence"]
+ sent3 [label="sentence"]
+
+ rnn2 [label="2nd level RNN" shape=box]
+
+ sent2 -> rnn2
+ sent3 -> rnn2
+ }
+
+ subgraph cluster2 {
+ label = "time step 2"
+
+ sent4 [label="sentence"]
+ sent5 [label="sentence"]
+
+ rnn3 [label="2nd level RNN" shape=box]
+
+ sent4 -> rnn3
+ sent5 -> rnn3
+ }
+
+
+ para0 [label="paragraph info 0"]
+ para1 [label="paragraph info 1"]
+ para2 [label="paragraph info 2"]
+
+ rnn1 -> para0
+ rnn2 -> para1
+ rnn3 -> para2
+
+ para0 -> rnn
+ para1 -> rnn
+ para2 -> rnn
+
+ chapter [label="chapter info"]
+ rnn -> chapter
+}
diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ
diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/images/asgd.gif differ
diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+ subgraph cluster_before {
+ Prev [label="...", shape=plaintext];
+ Rnn [label="rnn_op", shape=box];
+ BatchNorm [label="batch_norm_op", shape=box];
+ Fc [label="fc_op", shape=box];
+ After [label="...", shape=plaintext];
+ Prev -> Rnn -> BatchNorm -> Fc -> After;
+ label="original";
+ }
+
+ subgraph cluster_after {
+ Prev2 [label="...", shape=plaintext];
+ Rnn2 [label="rnn_op", shape=box];
+ BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+ BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+ Fc2_1 [label="fc_op", shape=box];
+ Fc2_2 [label="fc_op", shape=box];
+ After2_1 [label="...", shape=plaintext];
+ After2_2 [label="...", shape=plaintext];
+ Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+ Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+ label="forked";
+ }
+}
diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ
diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/images/beam_search.png differ
diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ
diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ
diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/images/compiler.png differ
diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ
diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ
diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/images/dcgan.png differ
diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ
diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ
diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ
diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ
diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ
diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ
diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ
diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ
diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ
diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+ sed 's/color=red/color=red, style=invis/g' | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+ sed 's/color=green/color=green, style=invis/g' | \
+ dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+ dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+ ///////// The forward part /////////
+ FeedX [label="Feed", color=blue, shape=box];
+ FeedY [label="Feed", color=blue, shape=box];
+ InitW [label="Init", color=blue, shape=diamond];
+ Initb [label="Init", color=blue, shape=diamond];
+ FC [label="FC", color=blue, shape=box];
+ MSE [label="MSE", color=blue, shape=box];
+
+ x [label="x", color=blue, shape=oval];
+ l [label="l", color=blue, shape=oval];
+ y [label="y", color=blue, shape=oval];
+ W [label="W", color=blue, shape=doublecircle];
+ b [label="b", color=blue, shape=doublecircle];
+ cost [label="cost", color=blue, shape=oval];
+
+ FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+ FeedY -> l [color=blue];
+ InitW -> W [color=blue];
+ Initb -> b [color=blue];
+ W -> FC [color=blue];
+ b -> FC [color=blue];
+ l -> MSE [color=blue];
+
+ ////////// The backward part /////////
+ MSE_Grad [label="MSE_grad", color=red, shape=box];
+ FC_Grad [label="FC_grad", color=red, shape=box];
+
+ d_cost [label="d cost", color=red, shape=oval];
+ d_y [label="d y", color=red, shape=oval];
+ d_b [label="d b", color=red, shape=oval];
+ d_W [label="d W", color=red, shape=oval];
+
+ cost -> MSE_Grad [color=red];
+ d_cost -> MSE_Grad [color=red];
+ l -> MSE_Grad [color=red];
+ y -> MSE_Grad -> d_y [color=red];
+
+ x -> FC_Grad [color=red];
+ y -> FC_Grad [color=red];
+ d_y -> FC_Grad [color=red];
+ W -> FC_Grad -> d_W [color=red];
+ b -> FC_Grad -> d_b [color=red];
+
+ ////////// The optimizaiton part //////////
+
+ OPT_W [label="SGD", color=green, shape=box];
+ OPT_b [label="SGD", color=green, shape=box];
+
+ W -> OPT_W [color=green];
+ b -> OPT_b [color=green];
+ d_W -> OPT_W -> W [color=green];
+ d_b -> OPT_b -> b [color=green];
+
+ ////////// Groupings //////////
+
+ subgraph clusterMSE {
+ style=invis;
+ MSE;
+ MSE_Grad;
+ }
+
+ subgraph clusterFC {
+ style=invis;
+ FC;
+ FC_Grad;
+ }
+}
diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ
diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ
diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ
diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/images/local-graph.png differ
diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ
diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ
diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ
diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ
diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ
diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ
diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ
diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ
diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ
diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ
diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ
diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ
diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/images/profiler.png differ
diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/images/readers.png differ
diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ
diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ
diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+ label = "simple RNN implementation"
+
+ ranksep=2;
+
+ //graph [nodesep=1, ranksep=1];
+
+ node[nodesep=1]
+
+ subgraph cluster0 {
+ label = "global scope"
+ rankdir = TB
+ W
+ boot_memory
+ input
+ output
+ }
+
+ subgraph cluster1 {
+ label = "step-scope 0"
+ rankdir = TB
+ memory0[label="memory"]
+ prememory0[label="pre-memory"]
+ step_input0[label="step input"]
+ step_output0[label="step output"]
+ }
+
+ subgraph cluster2 {
+ label = "step-scope 1"
+ rankdir = TB
+ memory1[label="memory"]
+ prememory1[label="pre-memory"]
+ step_input1[label="step input"]
+ step_output1[label="step output"]
+ }
+
+ subgraph cluster3 {
+ label = "step-scope 2"
+ rankdir = TB
+ memory2[label="memory"]
+ prememory2[label="pre-memory"]
+ step_input2[label="step input"]
+ step_output2[label="step output"]
+ }
+
+ stepnet [shape=box]
+ stepnet0 [shape=box, style=dashed]
+ stepnet1 [shape=box, style=dashed]
+ stepnet2 [shape=box, style=dashed]
+
+
+ edge[color=blue]
+ boot_memory -> prememory0 [label="init" color="blue"]
+ memory0 -> prememory1 [label="copy/reference" color="blue"]
+ memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+ edge[color=black]
+ W -> stepnet0[constraint=false, style=dashed]
+ W -> stepnet1[constraint=false, style=dashed]
+ W -> stepnet2[constraint=false, style=dashed]
+
+ memory0 -> stepnet0[style=dashed]
+ prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+ memory1 -> stepnet1[style=dashed]
+ prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+ memory2 -> stepnet2[style=dashed]
+ prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+ input -> step_input0
+ input -> step_input1
+ input -> step_input2
+
+ step_input0 -> stepnet0 [style=dashed]
+ step_input1 -> stepnet1[style=dashed]
+ step_input2 -> stepnet2[style=dashed]
+
+ step_output0 -> output
+ step_output1 -> output
+ step_output2 -> output
+
+ stepnet0 -> stepnet[style=dashed]
+ stepnet1 -> stepnet[style=dashed]
+ stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ
diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/images/rnn.png differ
diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+ chapter [label="chapter"]
+
+ subgraph cluster0 {
+ label = "paragraph 0"
+
+ top_rnn0[label="top rnn step 0" shape=box]
+
+ p0 [label="paragraph 0"]
+ p1 [label="paragraph 1"]
+ }
+
+ subgraph cluster1{
+ label = "paragraph 1"
+
+ top_rnn1[label="top rnn step 1" shape=box]
+
+ p2 [label="paragraph 0"]
+ p3 [label="paragraph 1"]
+ }
+
+ subgraph cluster_p0 {
+ label = "sentence 0"
+
+ low_rnn0 [label="low rnn step 0" shape=box]
+ s00 [label="sentence 0"]
+ s01 [label="sentence 1"]
+
+ low_rnn0 -> s00
+ low_rnn0 -> s01
+ }
+
+ subgraph cluster_p1 {
+ label = "sentence 1"
+ low_rnn1 [label="low rnn step 1" shape=box]
+ s10 [label="sentence 0"]
+ s11 [label="sentence 1"]
+ low_rnn1 -> s10
+ low_rnn1 -> s11
+ }
+
+ subgraph cluster_p2 {
+ label = "sentence 1"
+ low_rnn2 [label="low rnn step 0" shape=box]
+ s20 [label="sentence 0"]
+ s21 [label="sentence 1"]
+ low_rnn2 -> s20
+ low_rnn2 -> s21
+ }
+
+ subgraph cluster_p3 {
+ label = "sentence 1"
+ low_rnn3 [label="low rnn step 1" shape=box]
+ s30 [label="sentence 0"]
+ s31 [label="sentence 1"]
+ low_rnn3 -> s30
+ low_rnn3 -> s31
+ }
+
+
+ chapter -> top_rnn0
+ chapter -> top_rnn1
+
+ top_rnn0 -> p0
+ top_rnn0 -> p1
+ top_rnn1 -> p2
+ top_rnn1 -> p3
+
+
+ p0 -> low_rnn0
+ p1 -> low_rnn1
+ p2 -> low_rnn2
+ p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ
diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ
diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ
diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ
diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/images/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+ z -> generator -> G_img;
+ G_img -> discriminator -> D_f -> d_loss_f;
+ label0 -> d_loss_f -> d_loss;
+
+ img -> discriminator -> D_t -> d_loss_t;
+ label1 -> d_loss_t -> d_loss;
+
+ d_loss -> d_loss_t[color=red, style=dashed];
+ d_loss -> d_loss_f[color=red, style=dashed];
+ d_loss_t -> D_t[color=red, style=dashed];
+ d_loss_f -> D_f[color=red, style=dashed];
+ D_t -> discriminator[color=red, style=dashed];
+ D_f -> discriminator[color=red, style=dashed];
+
+ D_f -> g_loss;
+ label2 -> g_loss;
+
+ g_loss -> D_f[color=green, style=dashed];
+ D_f -> discriminator[color=green, style=dashed];
+ discriminator -> G_img[color=green, style=dashed];
+ G_img -> generator[color=green, style=dashed];
+
+ discriminator [color=red, shape=box];
+ generator [color=green, shape=box];
+ z [shape=diamond];
+ img [shape=diamond];
+ label0 [shape=diamond];
+ label1 [shape=diamond];
+ label2 [shape=diamond];
+
+ d_loss [color=red];
+ g_loss [color=green];
+}
diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/images/test.dot.png differ
diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ
diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ
diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 260b6c9fd1b364433cae098bacea77aa7fe9e266..76b82fd97f1ed642696c4414676b694ebda9ad81 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e5757b86b43001bc6090d8edd0aaa5ff4fc476ee..5aa5c1381fa3fad4ebc181c7868da03ae0138016 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 82de7a3a3e1ca7724e1eda877d53454a4fa4129a..be957d37b14c618e9346251b3bd3dbaf1541773f 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_docs gen_proto_py)
+add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
-add_dependencies(paddle_v2_docs_cn gen_proto_py)
+add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index da1eafc02ed8cd155d4f0f1fbadcb7b237b6fcc1..2670a21a227546ffcee4f10f395feef3c58df9b4 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
-add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
+add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
index d264b0a9f85faffd49c1982117cb5a3ac6ffc015..de6b60f29eb97029a54609cd2194bb7faf3ffec5 100644
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
@@ -1,3 +1,96 @@
# Recurrent Group Tutorial
-TBD
+## Overview
+
+Sequential data is common in natural language processing.
+
+A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words.
+
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
+
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
+
+Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
+
+Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: Layers for supporting double-layer sequences as input.
+
+## Related Concepts
+
+### Basic Principle
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
+
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+
+The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
+
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+
+### Input Example
+
+Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice.
+
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+
+- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
+
+- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401).
+
+In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
+
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation.
+
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
+
+## Sequence-level RNN Introduction
+
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+
+Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+
+- Word-level RNN: each state corresponds to a word.
+- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence.
+
+For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
+
+## Usage of Sequence-level RNN
+
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+
+- **Single-input Single-output**: Both input and output are single layer sequences.
+ - If there are multiple inputs, the number of words in different input sequences must be exactly equal.
+ - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
+ - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+ - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false.
+
+- **Double-input Double-output**: Both input and output are two-level sequence.
+ - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal.
+ - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default.
+ - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent.
+ - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
+
+- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now".
+
+### Usage of Generation Process
+Using `beam_search` need follow those conventions:
+
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index cf84568ecdf1227b0d0ed3606a4a9a6e5186af72..06e1f5d5f0884efabfcdf917ca5c35d94ad5dce9 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
${START_END}
)
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
- COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+ COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
DEPENDS _swig_paddle
)
# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 761aeb5b174105edece8880a9f5012c13a63fd11..13cb79129cc2272d215cdb475fb146b37266699e 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,3 +1,8 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+ COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
+
py_test(testTrain SRCS testTrain.py)
py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 63ec51564793ca2255032d0efbe2c47326f8b698..b790fa39fe863bbb00f6cd36d4c63481b7634fe1 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t groups);
+/**
+ * @brief Upsample forward.
+ * @param[in] inputData input data.
+ * @param[out] maskData the mask data from MaxPoolWithMaskLayer.
+ * @param[out] batchSize the batch size of the input.
+ * @param[in] imgSizeH image height.
+ * @param[in] imgSizeW image width.
+ * @param[in] channels the input channels.
+ * @param[in] outputH the output height.
+ * @param[in] outputW the output widht.
+ * @param[out] outputData output data.
+ */
+extern void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData);
+
+/**
+ * @brief Upsample backward.
+ * @param[in] outputGradData the output grad data.
+ * @param[out] maskData the mask data from MaxPoolWithMaskLayer.
+ * @param[out] batchSize the batch size of the input.
+ * @param[in] imgSizeH image height.
+ * @param[in] imgSizeW image width.
+ * @param[in] channels the input channels.
+ * @param[in] outputH the output height.
+ * @param[in] outputW the output widht.
+ * @param[out] inputGradData the input grad data.
+ */
+extern void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData);
+
#endif // HL_CNN_H_
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index c39bd3228d3f2ea7495cd21f5ff60bdfbbd2b51d..997eed62e07827f375c7441554b397fdd0bd6a80 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t group) {}
+inline void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData) {}
+
+inline void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData) {}
+
#endif // HL_CNN_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index a4459243e8a7c8be58be2255faf89e29817fbdf5..bac743a293cc97b114281e510d06367a86536452 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad,
num_kernels, inGrad, outGrad, idData, size, featLen, groups);
CHECK_SYNC("hl_maxout_backward failed");
}
+
+__global__ void upsampleForwardCompute(real* input_data,
+ real* mask_data,
+ size_t nthreads,
+ size_t in_h,
+ size_t in_w,
+ size_t out_h,
+ size_t out_w,
+ real* output_data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < nthreads) {
+ int offset = index / (in_w * in_h) * out_h * out_w;
+ int upsample_idx = static_cast(mask_data[index]);
+ output_data[offset + upsample_idx] = input_data[index];
+ }
+}
+
+__global__ void upsampleBackwardCompute(real* out_grad,
+ real* mask_data,
+ size_t nthreads,
+ size_t in_h,
+ size_t in_w,
+ size_t out_h,
+ size_t out_w,
+ real* input_grad) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index < nthreads) {
+ int offset = index / (in_w * in_h) * out_h * out_w;
+ int upsample_idx = static_cast(mask_data[index]);
+ input_grad[index] = out_grad[offset + upsample_idx];
+ }
+}
+
+void hl_upsample_forward(real* inputData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* outputData) {
+ int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ upsampleForwardCompute<<>>(inputData,
+ maskData,
+ num_kernels,
+ imgSizeH,
+ imgSizeW,
+ outputH,
+ outputW,
+ outputData);
+ CHECK_SYNC("hl_upsample_forward failed");
+}
+
+void hl_upsample_backward(real* outputGradData,
+ real* maskData,
+ size_t batchSize,
+ size_t imgSizeH,
+ size_t imgSizeW,
+ size_t channels,
+ size_t outputH,
+ size_t outputW,
+ real* inputGradData) {
+ int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+ int blocks = (num_kernels + 1024 - 1) / 1024;
+ upsampleBackwardCompute<<>>(outputGradData,
+ maskData,
+ num_kernels,
+ imgSizeH,
+ imgSizeW,
+ outputH,
+ outputW,
+ inputGradData);
+ CHECK_SYNC("hl_upsample_backward failed");
+}
diff --git a/paddle/fluid/framework/.clang-format b/paddle/fluid/.clang-format
similarity index 100%
rename from paddle/fluid/framework/.clang-format
rename to paddle/fluid/.clang-format
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index c425c71160a8fa3830a5fbdae1baaed850710877..a473ed7400012b7d0cbc5ab9bed263b3cca8c6ec 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
- COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+ COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 468423e0e8e7b8c9ebc14b7568c9c3bd21645ea7..873969b2a884f6d9e133fe87bf72725c36ce8b98 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include
#include
#include
+#include
#include
#include
@@ -96,6 +97,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
+ void RemoveVar(const std::string &name) { vars_.erase(name); }
+
std::vector AllOps() const;
size_t OpSize() const { return ops_.size(); }
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 019bea600f496a6b58579ad0aa8af836cd6134a9..722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once
-#include // for size_t
-#include
+#include // for size_t
+#include // NOLINT
#include
#include "paddle/fluid/platform/enforce.h"
@@ -216,7 +216,8 @@ class ChannelHolder {
template
struct PlaceholderImpl : public Placeholder {
- PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+ explicit PlaceholderImpl(size_t buffer_size)
+ : type_(std::type_index(typeid(T))) {
channel_.reset(MakeChannel(buffer_size));
}
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index e056779ea0dd0a31191b628f82724298efaf50ff..26d454534e1ae38c4f83376c0836a45781ea9101 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include // for size_t
#include
-#include
+#include // NOLINT
#include
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel {
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
- ChannelImpl(size_t);
+ explicit ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel {
const void *referrer; // TODO(thuan): figure out better way to do this
std::function callback;
- QueueMessage(T *item)
+ explicit QueueMessage(T *item)
: data(item), cond(std::make_shared()) {}
QueueMessage(T *item, std::shared_ptr cond)
@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel {
}
std::shared_ptr get_first_message(
- std::deque> &queue, ChannelAction action) {
- while (!queue.empty()) {
+ std::deque> *queue, ChannelAction action) {
+ while (!queue->empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
- std::shared_ptr m = queue.front();
- queue.pop_front();
+ std::shared_ptr m = queue->front();
+ queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
@@ -147,7 +147,7 @@ void ChannelImpl::Send(T *item) {
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr m =
- get_first_message(recvq, ChannelAction::SEND);
+ get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) {
*(m->data) = std::move(*item);
@@ -198,7 +198,7 @@ bool ChannelImpl::Receive(T *item) {
// buffer and move front of send queue to the buffer
if (!sendq.empty()) {
std::shared_ptr m =
- get_first_message(sendq, ChannelAction::RECEIVE);
+ get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) {
// Case 1 : Channel is Buffered
// Do Data transfer from front of buffer
@@ -219,8 +219,9 @@ bool ChannelImpl::Receive(T *item) {
if (m != nullptr) {
*item = std::move(*(m->data));
m->Notify();
- } else
+ } else {
return recv_return(Receive(item));
+ }
}
return recv_return(true);
}
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 1184bfdae1940286fb72d9091ae4f23ff7f84a54..542d791f6bbdf7d68a4786998ccc0233fff6473d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h"
-#include
-#include
+#include // NOLINT
+#include // NOLINT
#include "gtest/gtest.h"
using paddle::framework::Channel;
@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
- if (i < buffer_size)
+ if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations
- else {
+ } else {
bool is_exception = false;
try {
ch->Send(&i);
@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
}
void ChannelCloseUnblocksReceiversTest(Channel *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
- bool send_success[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
+ bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successfuld sends
int ct = 0;
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
}
// Join all threads
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
- size_t num_threads = 5;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const size_t kNumThreads = 5;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests that closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) {
- const int num_threads = 15;
- std::thread t[num_threads];
- bool thread_ended[num_threads];
+ const int kNumThreads = 15;
+ std::thread t[kNumThreads];
+ bool thread_ended[kNumThreads];
// Launches threads that try to send data to channel.
- for (size_t i = 0; i < num_threads / 3; i++) {
+ for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *ended) {
@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to receive data to channel.
- for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
+ for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to close the channel.
- for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
+ for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked
- for (size_t i = 0; i < num_threads; i++) {
+ for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
EXPECT_TRUE(ch->IsClosed());
// delete the channel
delete ch;
- for (size_t i = 0; i < num_threads; i++) t[i].join();
+ for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index bf1a705ef50b663efa53393ead1f81fd6bcf8c48..89b5c6847f15b3f2a270fe1e7db9e590549e8982 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -16,6 +16,6 @@ else()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index c277bd7cb69bba899296efe64107ee538c4aa847..128a5344fbb8c64c36ade24475bd0d99bdb3e0f5 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -21,6 +21,9 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
+#include
+#include
+
namespace paddle {
namespace framework {
namespace details {
@@ -168,6 +171,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build(
*/
PolishGraphToSupportDataHazards(&result);
+ /*
+ * Only variables should be the leaves of graph.
+ */
+ AddOutputToLeafOps(&result);
+
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index 361ba6d39721eed406a30fea325b3b4508ec45d0..0a4febd22f3feefdcac99cafc2cb58269380d192 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
sout << "}\n";
}
+
+void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
+ for (auto &op : graph->ops_) {
+ if (!op->outputs_.empty()) {
+ continue;
+ }
+ auto *dummy_leaf = new DummyVarHandle();
+ graph->dep_vars_.emplace(dummy_leaf);
+ op->AddOutput(dummy_leaf);
+ }
+}
} // namespace details
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
index bf20e7164a100718c1dcfe3ef971cfff60bbbaa2..be1f0460e45402806b18835f054a7195df1374cc 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.h
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -14,13 +14,13 @@
#pragma once
+#include
+#include
+
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
-#include
-#include
-
namespace paddle {
namespace framework {
namespace details {
@@ -52,6 +52,8 @@ class SSAGraphBuilder {
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
+ static void AddOutputToLeafOps(SSAGraph *graph);
+
static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
};
} // namespace details
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 1f96b9dc6235a18f7566c98cca60baa964e6aa56..596e5731868630cebc3cf51b2e78d4deb39a9b33 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Step 2. Insert FetchOps
std::vector> fetch_ops;
- std::vector dummy_vars;
FeedFetchList fetch_data(fetch_tensors.size());
std::unordered_map> fetched_vars;
@@ -101,13 +100,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
+ std::unordered_set> fetch_dependencies;
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
fetch_ops.emplace_back(op);
- // FIXME: Use new device context
for (auto &p : places_) {
op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
}
@@ -115,6 +114,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto *var : vars) {
op->AddInput(var);
}
+
+ auto *fetch_dummy = new DummyVarHandle();
+ op->AddOutput(fetch_dummy);
+ fetch_dependencies.emplace(fetch_dummy);
+ InsertPendingVar(*fetch_dummy);
InsertPendingOp(*op);
}
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 64c06687b6b905186d4efcc8441d3abef6323d53..16a118090ba9cfd50b4b03484983f9fc73cf7973 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -279,6 +279,21 @@ std::unique_ptr Executor::Prepare(
return std::unique_ptr(ctx);
}
+std::vector> Executor::Prepare(
+ const ProgramDesc& program, const std::vector& block_ids) {
+ std::vector> result;
+ for (auto& bid : block_ids) {
+ auto* ctx = new ExecutorPrepareContext(program, bid);
+ PADDLE_ENFORCE_LT(static_cast(bid), program.Size());
+ auto& block = program.Block(bid);
+ for (auto& op_desc : block.AllOps()) {
+ ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
+ }
+ result.push_back(std::shared_ptr(ctx));
+ }
+ return result;
+}
+
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 7173c51c95e04ad3095f01bb24923a7a3341c517..d7c99165f0c9d3b1ae11a3b4753a61e8118f7b52 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -61,6 +61,9 @@ class Executor {
static std::unique_ptr Prepare(
const ProgramDesc& program, int block_id);
+ static std::vector> Prepare(
+ const ProgramDesc& program, const std::vector& block_ids);
+
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index dee505fee0dccd8d60bb290a8bec4df243e504a2..4f130d265900483ec7a7c541f2610d17a352913f 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
+ // Split LoDTensor and copy to each place specified in places.
std::vector SplitLoDTensor(
const std::vector places) const;
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f6a43804ef2fd73c4a2c2c3b3dfbb90bff1c451b..a3b4a8c0829ae3324e933309b2eaea35fe571997 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -35,6 +35,17 @@ std::vector> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
+proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
+ if (var->IsType()) {
+ return framework::ToDataType(var->Get().type());
+ } else if (var->IsType()) {
+ return framework::ToDataType(
+ var->Get().value().type());
+ } else {
+ PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+ }
+}
+
static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 41214b41cb68cbd7049552f39195ae5257e0d06f..b7a7c69b4c8493f945926c75797c49d327a3197e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
+proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+
class OperatorBase;
class ExecutionContext;
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 17885143247f0e0db8f12931e3c3412e7114ef3d..7be93fa6002ae93c3e1b75c8f7fe5ca5f40b271f 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -150,13 +150,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
-void ParallelExecutor::Run(const std::vector &fetch_tensors,
- const std::string &fetched_var_name) {
+void ParallelExecutor::Run(
+ const std::vector &fetch_tensors,
+ const std::string &fetched_var_name,
+ const std::unordered_map &feed_tensors) {
platform::RecordBlock b(0);
+ SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable() =
fetch_data;
}
+void ParallelExecutor::SplitTensorToPlaces(
+ const std::unordered_map &feed_tensors) {
+ for (auto it : feed_tensors) {
+ auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
+ for (size_t j = 0; j < member_->places_.size(); ++j) {
+ // TODO(panxy0718): Do I need to delete this var?
+ member_->local_scopes_[j]
+ ->Var(it.first)
+ ->GetMutable()
+ ->ShareDataWith(lod_tensors[j]);
+ }
+ }
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 964b476234e622cae934d41bc3793bc3114a5f1a..c7c58b2b808383621a6d492f9188b0d36bfa6858 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,9 +42,13 @@ class ParallelExecutor {
bool allow_op_delay);
void Run(const std::vector& fetch_tensors,
- const std::string& fetched_var_name = "fetched_var");
+ const std::string& fetched_var_name,
+ const std::unordered_map& feed_tensors);
private:
+ void SplitTensorToPlaces(
+ const std::unordered_map& feed_tensors);
+
ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 504344e937dfdc362cdc22298a5f963d87011e9d..d9d6b7dd67f1c6e4bbd6a4e1a8f0843d4cb93c05 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
+
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index 9458d56a01df432aea573d796456b9be31350038..8e2d9470d3954e0f66c74828a8d8292c2875a8f4 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
+
http://www.apache.org/licenses/LICENSE-2.0
+
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -47,6 +50,15 @@ class SelectedRows {
void set_rows(const Vector& rows) { rows_ = rows; }
+ /**
+ * get the index of id in rows
+ */
+ int64_t index(int64_t id) const {
+ auto it = std::find(rows_.begin(), rows_.end(), id);
+ PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
+ return static_cast(std::distance(rows_.begin(), it));
+ }
+
DDim GetCompleteDims() const {
std::vector dims = vectorize(value_->dims());
dims[0] = height_;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f7a6b5ba84ca1762bd903790aa3c0346b22ed035..6f878541e6de1deec1829145b1b325ecd176a034 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -45,11 +45,10 @@ class Tensor {
friend struct EigenVector;
public:
- Tensor() : offset_(0), is_pinned_(false) {}
+ Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
- explicit Tensor(const platform::Place& place)
- : offset_(0), is_pinned_(false) {
+ explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
@@ -70,12 +69,11 @@ class Tensor {
* @note If not exist, then allocation.
*/
template
- inline T* mutable_data(platform::Place place, bool is_pinned = false);
+ inline T* mutable_data(platform::Place place);
- inline void* mutable_data(platform::Place place, std::type_index type,
- bool is_pinned = false);
+ inline void* mutable_data(platform::Place place, std::type_index type);
- inline void* mutable_data(platform::Place place, bool is_pinned = false);
+ inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
@@ -86,8 +84,7 @@ class Tensor {
* @note If not exist, then allocation.
*/
template
- inline T* mutable_data(DDim dims, platform::Place place,
- bool is_pinned = false);
+ inline T* mutable_data(DDim dims, platform::Place place);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
@@ -95,9 +92,6 @@ class Tensor {
/*! Return the numel of the memory block. */
inline int64_t numel() const;
- /*! Return the numel of the memory block. */
- inline bool isPinned() const;
-
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
@@ -152,14 +146,12 @@ class Tensor {
template
struct PlaceholderImpl : public Placeholder {
- PlaceholderImpl(Place place, size_t size, std::type_index type,
- bool is_pinned = false)
- : ptr_(static_cast(memory::Alloc(place, size, is_pinned)),
- memory::PODDeleter(place, is_pinned)),
+ PlaceholderImpl(Place place, size_t size, std::type_index type)
+ : ptr_(static_cast(memory::Alloc(place, size)),
+ memory::PODDeleter(place)),
place_(place),
size_(size),
- type_(type),
- is_pinned_(is_pinned) {
+ type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@@ -182,9 +174,6 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
-
- /*! use pinned memory or not. */
- bool is_pinned_;
};
/*! holds the memory block if allocated. */
@@ -219,7 +208,6 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
- bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 113814971e115fa88bd0ded34017fa26a9dd5803..f49d1a47a325b2aac6185073203df124be18b54d 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -101,21 +101,19 @@ inline T* Tensor::data() {
}
template
-inline T* Tensor::mutable_data(DDim dims, platform::Place place,
- bool is_pinned) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
static_assert(std::is_pod::value, "T must be POD");
Resize(dims);
- return mutable_data(place, is_pinned);
+ return mutable_data(place);
}
template
-inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod::value, "T must be POD");
- return reinterpret_cast(mutable_data(place, typeid(T), is_pinned));
+ return reinterpret_cast(mutable_data(place, typeid(T)));
}
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
- bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
@@ -129,27 +127,33 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl(
- boost::get(place), size, type, is_pinned));
- } else if (platform::is_gpu_place(place)) {
+ boost::get(place), size, type));
+ } else if (platform::is_gpu_place(place) ||
+ platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
- PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+ PADDLE_THROW(
+ "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
- holder_.reset(new PlaceholderImpl(
- boost::get(place), size, type, is_pinned));
+ if (platform::is_gpu_place(place)) {
+ holder_.reset(new PlaceholderImpl(
+ boost::get(place), size, type));
+ } else if (platform::is_cuda_pinned_place(place)) {
+ holder_.reset(new PlaceholderImpl(
+ boost::get(place), size, type));
+ }
}
#endif
offset_ = 0;
- is_pinned_ = is_pinned;
}
return reinterpret_cast(reinterpret_cast(holder_->ptr()) +
offset_);
}
-inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
+inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
- "Cannot invoke mutable data if current hold nothing");
- return mutable_data(place, holder_->type(), is_pinned);
+ "Cannot invoke mutable data if current hold nothing.");
+ return mutable_data(place, holder_->type());
}
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -191,8 +195,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
-inline bool Tensor::isPinned() const { return is_pinned_; }
-
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 8b7533ce712b0a01060842b6f71449ed6bd23e2c..1d864af011bced9df188147ec436b8de12947ba9 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -148,6 +148,11 @@ struct AnyVisitor : public boost::static_visitor {
const platform::CPUPlace& cpu) const {
return *out.data();
}
+
+ bool GetResult(const framework::Tensor& out,
+ const platform::CUDAPinnedPlace& cpu) const {
+ return *out.data();
+ }
};
template
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
index 78996908b18a5a0935d8de9920e8ccef9069e74b..f6c6a1fec13d8b12efd1fa71a7a93316e484d045 100644
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -35,24 +35,25 @@ class Tuple {
public:
using ElementVars = std::vector;
- Tuple(std::vector& var, std::vector& var_desc)
+ Tuple(const std::vector& var,
+ const std::vector& var_desc)
: var_(var), var_desc_(var_desc) {}
- Tuple(std::vector& var) : var_(var) {}
+ explicit Tuple(std::vector& var) : var_(var) {}
- ElementVar get(int idx) const { return var_[idx]; };
+ ElementVar get(int idx) const { return var_[idx]; }
- ElementVar& get(int idx) { return var_[idx]; };
+ ElementVar& get(int idx) { return var_[idx]; }
- bool isSameType(Tuple& t) const;
+ bool isSameType(const Tuple& t) const;
- size_t getSize() const { return var_.size(); };
+ size_t getSize() const { return var_.size(); }
private:
ElementVars var_;
std::vector var_desc_;
};
-bool Tuple::isSameType(Tuple& t) const {
+bool Tuple::isSameType(const Tuple& t) const {
size_t tuple_size = getSize();
if (tuple_size != t.getSize()) {
return false;
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 52e9c0baa64508f82d0a86a88c8c5f8d23f9f7f2..a5b62ef322bfad0fc956d7d722797bd5add6aea6 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false;
}
-void LoadPersistables(framework::Executor& executor,
- framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
@@ -108,10 +107,8 @@ std::unique_ptr Load(framework::Executor& executor,
}
std::unique_ptr Load(
- framework::Executor& executor,
- framework::Scope& scope,
- const std::string& prog_filename,
- const std::string& param_filename) {
+ framework::Executor& executor, framework::Scope& scope,
+ const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 6817a6fca047c9336233697a7bee4e5e16eedd5e..d07d315b93ef10a464080899b1cb9920abe83be3 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
-void LoadPersistables(framework::Executor& executor,
- framework::Scope& scope,
+void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index e7ffb00ec8d8926193fe510ebdb7185f75c90906..6ed77adb9d891c75e7de358d0d7a0c06c9af96dd 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
- set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
+ set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 9ab808efec3abdb86724fb16725962958c5cf55c..3e77dc166c355bc141563eda4705ca8d75226ac4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
// The second dim of the input tensor should be 13
// The input data should be >= 0
int64_t batch_size = 10;
- SetupTensor(
- input, {batch_size, 13}, static_cast(0), static_cast(10));
+ SetupTensor(&input, {batch_size, 13}, static_cast(0),
+ static_cast(10));
std::vector cpu_feeds;
cpu_feeds.push_back(&input);
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index e9a27171f1cd68e7b10c860fb4a1417b930ed565..a6b6c3f828f4c6f59fca42e4c3d9580d6c136524 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [0.0, 1.0].
- SetupTensor(input,
- {FLAGS_batch_size, 3, 32, 32},
- static_cast(0),
- static_cast(1));
+ SetupTensor(&input, {FLAGS_batch_size, 3, 32, 32},
+ static_cast(0), static_cast(1));
std::vector cpu_feeds;
cpu_feeds.push_back(&input);
@@ -48,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---";
- TestInference(
- dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
+ TestInference(dirname, cpu_feeds, cpu_fetchs1,
+ FLAGS_repeat);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
@@ -59,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---";
- TestInference(
- dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
+ TestInference(dirname, cpu_feeds, cpu_fetchs2,
+ FLAGS_repeat);
LOG(INFO) << output2.dims();
CheckError(output1, output2);
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 184924016634bba26204d937744ca5fa87cd443c..84bb855fea5fa397ff71e2c922fea3302951b7ca 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2;
- SetupLoDTensor(word,
- lod,
- static_cast(0),
+ SetupLoDTensor(&word, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(predicate,
- lod,
- static_cast(0),
+ SetupLoDTensor(&predicate, lod, static_cast(0),
static_cast(predicate_dict_len - 1));
- SetupLoDTensor(ctx_n2,
- lod,
- static_cast(0),
+ SetupLoDTensor(&ctx_n2, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(ctx_n1,
- lod,
- static_cast(0),
+ SetupLoDTensor(&ctx_n1, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(ctx_0,
- lod,
- static_cast(0),
+ SetupLoDTensor(&ctx_0, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(ctx_p1,
- lod,
- static_cast(0),
+ SetupLoDTensor(&ctx_p1, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(ctx_p2,
- lod,
- static_cast(0),
+ SetupLoDTensor(&ctx_p2, lod, static_cast(0),
static_cast(word_dict_len - 1));
- SetupLoDTensor(mark,
- lod,
- static_cast(0),
+ SetupLoDTensor(&mark, lod, static_cast(0),
static_cast(mark_dict_len - 1));
std::vector cpu_feeds;
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
index 1fb0f9e77797cf6e61e918700763ee33a495cb96..f12828a2685305c20d26492dbf04fa9ddacf9317 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
- SetupTensor(input,
- {FLAGS_batch_size, 1, 28, 28},
- static_cast(-1),
- static_cast(1));
+ SetupTensor(&input, {FLAGS_batch_size, 1, 28, 28},
+ static_cast(-1), static_cast(1));
std::vector cpu_feeds;
cpu_feeds.push_back(&input);
@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
- TestInference(
- dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+ TestInference(dirname, cpu_feeds, cpu_fetchs1,
+ FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
- TestInference(
- dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+ TestInference(dirname, cpu_feeds, cpu_fetchs2,
+ FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
CheckError(output1, output2);
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
index b42a33c9a90b5feafaed343a197da0e4db11b7ea..70aa6b194d4417fc85384cc3f615089f024f928e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
// Use the first data from paddle.dataset.movielens.test() as input
std::vector user_id_data = {1};
- SetupTensor(user_id, {batch_size, 1}, user_id_data);
+ SetupTensor(&user_id, {batch_size, 1}, user_id_data);
std::vector gender_id_data = {1};
- SetupTensor(gender_id, {batch_size, 1}, gender_id_data);
+ SetupTensor(&gender_id, {batch_size, 1}, gender_id_data);
std::vector age_id_data = {0};
- SetupTensor(age_id, {batch_size, 1}, age_id_data);
+ SetupTensor(&age_id, {batch_size, 1}, age_id_data);
std::vector job_id_data = {10};
- SetupTensor(job_id, {batch_size, 1}, job_id_data);
+ SetupTensor(&job_id, {batch_size, 1}, job_id_data);
std::vector movie_id_data = {783};
- SetupTensor(movie_id, {batch_size, 1}, movie_id_data);
+ SetupTensor(&movie_id, {batch_size, 1}, movie_id_data);
std::vector category_id_data = {10, 8, 9};
- SetupLoDTensor(category_id, {3, 1}, {{0, 3}}, category_id_data);
+ SetupLoDTensor(&category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector movie_title_data = {1069, 4140, 2923, 710, 988};
- SetupLoDTensor(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+ SetupLoDTensor(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector cpu_feeds;
cpu_feeds.push_back(&user_id);
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
index a0523905bd1631cd8768b1601e459cb9d110a84d..e15c3f59acb1eac535120554a3799c37e9d4e951 100644
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}};
- SetupLoDTensor(
- word_data, lod, static_cast(0), static_cast(1));
- SetupLoDTensor(
- trg_word, lod, static_cast(0), static_cast(1));
+ SetupLoDTensor(&word_data, lod, static_cast(0),
+ static_cast(1));
+ SetupLoDTensor(&trg_word, lod, static_cast(0),
+ static_cast(1));
std::vector cpu_feeds;
cpu_feeds.push_back(&word_data);
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index 824b3274ebc7ba046e61798b3f61ef9924a75679..0dbb6a30405eb64133613052ad57b1f705a9e7b4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoD lod{{0, 4, 10}};
int64_t word_dict_len = 5147;
- SetupLoDTensor(words,
- lod,
- static_cast(0),
+ SetupLoDTensor(&words, lod, static_cast(0),
static_cast(word_dict_len - 1));
std::vector cpu_feeds;
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index 1481760c529c29a7290f476e2a22e1ded5ab7787..c9328eb21b4fdb06c5f65ba0f7337b1e79fa1927 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include
#include "gflags/gflags.h"
+#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
- SetupLoDTensor(first_word, lod, static_cast(0), dict_size - 1);
- SetupLoDTensor(second_word, lod, static_cast(0), dict_size - 1);
- SetupLoDTensor(third_word, lod, static_cast(0), dict_size - 1);
- SetupLoDTensor(fourth_word, lod, static_cast(0), dict_size - 1);
+ SetupLoDTensor(&first_word, lod, static_cast(0), dict_size - 1);
+ SetupLoDTensor(&second_word, lod, static_cast(0), dict_size - 1);
+ SetupLoDTensor(&third_word, lod, static_cast(0), dict_size - 1);
+ SetupLoDTensor(&fourth_word, lod, static_cast(0), dict_size - 1);
std::vector cpu_feeds;
cpu_feeds.push_back(&first_word);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index dce541c0971a6ff9a3728e915fe8c3d009c23550..5118e66f1e7a36333ff4425361e54ec59e6ba05b 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#pragma once
+
+#include