提交 38ba7e5a 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_reader

......@@ -25,12 +25,3 @@ third_party/
# clion workspace.
cmake-build-*
# generated while compiling
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/fluid/pybind/pybind.h
python/paddle/version.py
......@@ -36,11 +36,41 @@
- Trainer Count: 100
- Metrics: mini-batch / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure the Performance for Different PServer Count
......@@ -48,11 +78,41 @@
- Batch Size: 64
- Metrics: mini-batch / sec
| PServer Count | 10 | 20 | 40 | 60 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - |
| TensorFlow | - | - | - | - |
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>10</th>
<th>20</th>
<th>40 </th>
<th>60</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
### Measure Parallel Efficiency By Increasing Trainer Count
......@@ -67,11 +127,69 @@ The parallel efficiency is:
$E = \div(S, N)$
| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
<table>
<thead>
<tr>
<th>Trainer Counter </th>
<th>1</th>
<th>10</th>
<th>20 </th>
<th>30</th>
<th>40</th>
<th>50</th>
<th>60 </th>
<th>70</th>
<th>80</th>
<th>90</th>
<th>100 </th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
<tr>
<td>TensorFlow </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
<td>- </td>
<td>-</td>
<td>- </td>
<td>- </td>
</tr>
</tbody>
</table>
## Reproduce the benchmark
......
......@@ -16,11 +16,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 15.44 </td>
<td> 16.32 </td>
<td> 16.74 </td>
<td> 16.79 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 15.97 </td>
<td> 17.04 </td>
<td> 17.60 </td>
<td> 17.83 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> 9.09 </td>
<td> 9.10 </td>
<td> 9.24 </td>
<td> 8.66 </td>
</tr>
</tbody>
</table>
### Different Batch Size
......@@ -28,12 +58,40 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Trainer Count: 20
- Metrics: samples / sec
| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |
<table>
<thead>
<tr>
<th>Batch Size </th>
<th> 32</th>
<th>64</th>
<th>128 </th>
<th>256</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 190.20 </td>
<td> 222.15 </td>
<td> 247.40 </td>
<td> 258.18 </td>
</tr>
<tr>
<td>PaddlePaddle v2 </td>
<td> 170.96 </td>
<td> 233.71 </td>
<td> 256.14 </td>
<td> 329.23 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Accelerate Rate
......@@ -41,11 +99,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples / sec
| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
<table>
<thead>
<tr>
<th>Trainer Count </th>
<th>20</th>
<th>40</th>
<th>80</th>
<th>100</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid</td>
<td> 263.29 (78.64%) </td>
<td> 518.80 (77.47%) </td>
<td> 836.26 (62.44%) </td>
<td> 1019.29 (60.89%) </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 326.85 (92.85%) </td>
<td> 534.58 (75.93%) </td>
<td> 853.30 (60.60%) </td>
<td> 1041.99 (59.20%) </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
### Different Pserver Count
......@@ -53,11 +141,41 @@ Setting environment variable: `MKL_NUM_THREADS=1`.
- Batch Size: 128
- Metrics: samples/ sec
| PServer Count | 3 | 6 |10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |
<table>
<thead>
<tr>
<th>PServer Count </th>
<th>3</th>
<th>6</th>
<th>10</th>
<th>20</th>
</tr>
</thead>
<tbody>
<tr>
<td> PaddlePaddle Fluid(should fix in next PR) </td>
<td> 589.1 </td>
<td> 592.6 </td>
<td> 656.4 </td>
<td> 655.8 </td>
</tr>
<tr>
<td>PaddlePaddle v2 (need more tests) </td>
<td> 593.4 </td>
<td> 791.3 </td>
<td> 729.7 </td>
<td> 821.7 </td>
</tr>
<tr>
<td>TensorFlow </td>
<td> - </td>
<td> - </td>
<td> - </td>
<td> - </td>
</tr>
</tbody>
</table>
*The performance gap between Fuild and v2 comes from the network interference.*
......
......@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
......
......@@ -54,5 +54,7 @@ add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
add_dependencies(snappystream extern_snappystream)
......@@ -62,7 +62,8 @@ ExternalProject_Add(
)
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
......
......@@ -25,7 +25,8 @@ ELSE(WIN32)
SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
ExternalProject_Add(
extern_zlib
......
......@@ -251,7 +251,7 @@ function(cc_test TARGET_NAME)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction(cc_test)
......@@ -561,9 +561,9 @@ function(py_test TARGET_NAME)
set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction()
......
......@@ -27,7 +27,7 @@ sphinx_add_target(paddle_fluid_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_docs gen_proto_py)
add_dependencies(paddle_fluid_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
......@@ -50,6 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_fluid_docs_cn gen_proto_py)
add_dependencies(paddle_fluid_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
......@@ -19,4 +19,4 @@ sphinx_add_target(paddle_fluid_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind)
add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
......@@ -494,6 +494,12 @@ reshape
.. autofunction:: paddle.fluid.layers.reshape
:noindex:
pad
---
.. autofunction:: paddle.fluid.layers.pad
:noindex:
scale
-----
......
......@@ -5,9 +5,11 @@ In a large scale machine learning setup where the size of the training data is h
Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
<img src="./images/asgd.gif" align="center"/><br/>
<p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
</p>
We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
......
......@@ -6,11 +6,33 @@ Here are some initial thoughts. Your comments are welcome!
I think we need only the following few CMake functions to make a project description mean and clean:
| C++ | CUDA C++ | Go |
|---|---|---|
| cc_library | nv_library | go_library |
| cc_binary | nv_binary | go_binary |
| cc_test | nv_test | go_test |
<table>
<thead>
<tr>
<th>C++</th>
<th>CUDA C++</th>
<th>Go</th>
</tr>
</thead>
<tbody>
<tr>
<td>cc_library </td>
<td>nv_library </td>
<td>go_library </td>
</tr>
<tr>
<td>cc_binary </td>
<td>nv_binary </td>
<td>go_binary </td>
</tr>
<tr>
<td> cc_test </td>
<td> nv_test </td>
<td> go_test </td>
</tr>
</tbody>
</table>
- The `_library` functions generate .a files from source code.
- The `_binary` functions generate executable binary files.
......
......@@ -14,11 +14,29 @@ In programming languages, a block is a pair of curly braces that includes local
Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
| programming languages | PaddlePaddle |
|-----------------------|-----------------------|
| for, while loop | RNN, WhileOp |
| if, if-else, switch | IfElseOp, SwitchOp |
| sequential execution | a sequence of layers |
<table>
<thead>
<tr>
<th>programming languages</th>
<th>PaddlePaddle</th>
</tr>
</thead>
<tbody>
<tr>
<td>for, while loop </td>
<td>RNN, WhileOp </td>
</tr>
<tr>
<td>if, if-else, switch </td>
<td>IfElseOp, SwitchOp </td>
</tr>
<tr>
<td>sequential execution </td>
<td>a sequence of layers </td>
</tr>
</tbody>
</table>
A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
......@@ -26,12 +44,33 @@ A key difference is that a C++ program describes a one pass computation, whereas
The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
| programming languages | PaddlePaddle |
|-----------------------|---------------------------------|
| stack | scope hierarchy |
| stack frame | scope |
| push at entering block| push at entering block |
| pop at leaving block | destroy when minibatch completes|
<table>
<thead>
<tr>
<th>programming languages</th>
<th>PaddlePaddle</th>
</tr>
</thead>
<tbody>
<tr>
<td>stack </td>
<td>scope hierarchy </td>
</tr>
<tr>
<td>stack frame </td>
<td>scope </td>
</tr>
<tr>
<td>push at entering block </td>
<td>push at entering block </td>
</tr>
<tr>
<td>pop at leaving block </td>
<td>destroy when minibatch completes </td>
</tr>
</tbody>
</table>
1. In traditional programs:
......
......@@ -86,12 +86,40 @@ def layer.fc(X):
We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in above illustrative example:
| C++ functions/functors | mul | add | | |
|------------------------|--------------|--------------|-------------|----------|
| C++ operator class | mulOp | addOp | FCOp | |
| Python binding | operator.mul | operator.add | operator.fc | |
| Python function | | | | layer.fc |
<table>
<thead>
<tr>
<th>C++ functions/functors</th>
<th>mul</th>
<th>add</th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>C++ operator class </td>
<td>mulOp</td>
<td>addOp </td>
<td>FCOp </td>
<td></td>
</tr>
<tr>
<td>Python binding </td>
<td>operator.mul</td>
<td> operator.add </td>
<td>operator.fc </td>
<td></td>
</tr>
<tr>
<td>Python function </td>
<td></td>
<td></td>
<td> </td>
<td>layer.fc</td>
</tr>
</tbody>
</table>
This is how we differentiate layer and operators in PaddlePaddle:
......
......@@ -2,12 +2,38 @@
Like other deep learning systems, PaddlePaddle supports training models from sequence data. Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor. What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
| | TensorFlow | PaddlePaddle |
|-----------------------|------------|--------------|
| RNN | Support | Support |
| recursive RNN | Support | Support |
| padding zeros | Must | No need |
| blob data type | Tensor | LoDTensor |
<table>
<thead>
<tr>
<th></th>
<th>TensorFlow</th>
<th>PaddlePaddle</th>
</tr>
</thead>
<tbody>
<tr>
<td>RNN </td>
<td>Support </td>
<td>Support </td>
</tr>
<tr>
<td>recursive RNN </td>
<td>Support </td>
<td>Support </td>
</tr>
<tr>
<td>padding zeros </td>
<td> Must </td>
<td>No need </td>
</tr>
<tr>
<td> blob data type </td>
<td> Tensor</td>
<td> LoDTensor </td>
</tr>
</tbody>
</table>
PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators. The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences. This document presents the design of LoD and LoDTensor.
......
......@@ -10,10 +10,27 @@ PaddlePaddle uses proto message to describe compile time program because :
The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
| |compile time|runtime|
|---|---|---|
|Data|VarDesc(proto)|Variable(cpp)|
|Operation|OpDesc(proto)|Operator(cpp)|
<table>
<thead>
<tr>
<th></th>
<th>compile time</th>
<th>runtime</th>
</tr>
</thead>
<tbody>
<tr>
<td>Data </td>
<td>VarDesc(proto) </td>
<td>Variable(cpp) </td>
</tr>
<tr>
<td>Operation </td>
<td>OpDesc(proto) </td>
<td>Operator(cpp) </td>
</tr>
</tbody>
</table>
## Definition of VarType
......
......@@ -114,13 +114,13 @@ current thread under two conditions:
#### Channel Send
<p align="center">
<img src="./images/channel_send.png"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
</p>
#### Channel Receive
<p align="center">
<img src="./images/channel_recv.png"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
</p>
## Limitations and Considerations
......
......@@ -10,12 +10,42 @@ The answer relies on the fact that a `ProgramDesc` is similar to an abstract syn
The following table compares concepts in Fluid and Go
| Go | Fluid |
|----|-------|
|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
<table>
<thead>
<tr>
<th></th>
<th>Go</th>
<th>Fluid</th>
</tr>
</thead>
<tbody>
<tr>
<td>user-defined functions </td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
<td></td>
</tr>
<tr>
<td>control-flow and built-in functions </td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators">intrinsics/operators</a></td>
<td></td>
</tr>
<tr>
<td>goroutines, channels </td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h">class ThreadPool</a></td>
<td></td>
</tr>
<tr>
<td>runtime </td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h">class Executor</a></td>
<td></td>
</tr>
</tbody>
</table>
## An Example Concurrent Program
......
......@@ -13,14 +13,41 @@ Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously exe
There were many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
<table>
<thead>
<tr>
<th>concurrent programming model</th>
<th>implementation</th>
</tr>
</thead>
<tbody>
<tr>
<td>mutex </td>
<td>types and functions in standard libraries </td>
</tr>
<tr>
<td>semaphore </td>
<td> types and functions in standard libraries </td>
</tr>
<tr>
<td> communicating sequential processes (CSP) </td>
<td> Go programming language </td>
</tr>
<tr>
<td> actor model </td>
<td> Erlang programming language </td>
</tr>
<tr>
<td> message passing </td>
<td> MPI </td>
</tr>
<tr>
<td> bulk synchronous parallel (BSP) </td>
<td> Pregel distributed programming framework </td>
</tr>
</tbody>
</table>
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
......
......@@ -254,7 +254,7 @@ only one case will be executed.
### select_op flow
<p align="center">
<img src="./images/select_op_workflow.png"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
</p>
The select algorithm is inspired by golang's select routine. Please refer to
......
......@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
<img src="src/compiler.png"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>
PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
<img src="src/paddle-compile.png"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>
The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
......@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
<img src="src/distributed_architecture.png"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>
The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
......@@ -152,7 +152,7 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on
Cluster environment.
<img src="src/remote_executor.png" width="500" align="center" />
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
......@@ -171,7 +171,7 @@ In the future, a more general placement algorithm should be implemented, which m
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
<img src="src/local_architecture.png"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>
### Training Data
......
......@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
## Transpiler
<img src="src/multi-threads/single-thread@3x.png" width="300">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">
After converted:
<img src="src/multi-threads/multi-threads@3x.png" width="1000">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
## Implement
......
......@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
Below is an example of converting the user defined graph to the
subgraphs for the trainer and the parameter server:
<img src="src/local-graph.png" width="300"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>
After converting:
<img src="src/dist-graph.png" width="700"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>
1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
......@@ -69,8 +69,7 @@ In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list o
non-zero gradient data. So when we do parameter optimization both locally and remotely,
we only need to send those non-zero rows to the optimizer operators:
<img src="src/sparse_update.png" width="700" />
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
### Benefits
- Model parallelism becomes easier to implement: it is an extension to
......
......@@ -5,7 +5,7 @@ This document describes the RNN (Recurrent Neural Network) operator and how it i
## RNN Algorithm Implementation
<p align="center">
<img src="./rnn.jpg"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
</p>
The above diagram shows an RNN unrolled into a full network.
......@@ -22,7 +22,7 @@ There are several important concepts here:
There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
<p align="center">
<img src="./rnn.png"/><br/>
<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn.png"/><br/>
Figure 2 illustrates the RNN's data flow
</p>
......@@ -93,7 +93,7 @@ For example, we could have a 2-level RNN, where the top level corresponds to par
The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
<p align="center">
<img src="./2_level_rnn.png"/>
<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/2_level_rnn.png"/>
</p>
```python
......@@ -149,5 +149,5 @@ If the `output_all_steps` is set to False, it will only output the final time st
<p align="center">
<img src="./rnn_2level_data.png"/>
<img src="https://github.com/PaddlePaddle/Paddle/tree/develop/doc/fluid/images/rnn_2level_data.png"/>
</p>
......@@ -66,7 +66,7 @@ As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attribu
The following graph showes the training computational process of `batch_norm_op`:
<img src="../images/batch_norm_op_kernel.png" width="800"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
......@@ -124,7 +124,7 @@ for pass_id in range(PASS_NUM):
`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
<div align=center>
<img src="../images/batch_norm_fork.png" width="500"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
</div>
Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
......
......@@ -2,12 +2,33 @@
Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
| Python classes | Protobuf messages |
| --- | --- |
| Program | ProgramDesc |
| Block | BlockDesc |
| Operator | OpDesc |
| Variable | VarDesc |
<table>
<thead>
<tr>
<th>Python classes</th>
<th>Protobuf messages</th>
</tr>
</thead>
<tbody>
<tr>
<td>Program </td>
<td>ProgramDesc </td>
</tr>
<tr>
<td>Block </td>
<td>BlockDesc </td>
</tr>
<tr>
<td>Operator </td>
<td>OpDesc </td>
</tr>
<tr>
<td>Variable </td>
<td>VarDesc </td>
</tr>
</tbody>
</table>
Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
......
......@@ -6,17 +6,17 @@ A central problem in machine learning is how to design an algorithm that will pe
### Parameter Norm Penalties
Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
<img src="./images/loss_equation.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
##### L2 Regularization:
<img src="./images/l2_regularization.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
##### L1 Regularization
<img src="./images/l1_regularization.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
......@@ -40,11 +40,11 @@ The idea of building ops for regularization is in sync with the refactored Paddl
Below is an example of a really simple feed forward neural network.
<img src="./images/feed_forward.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
<img src="./images/feed_forward_regularized.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
   
### Python API implementation for Regularization
......@@ -64,9 +64,3 @@ Since we want to create the regularization ops in a lazy manner, the regularizat
#### High-level API
In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
......@@ -10,11 +10,37 @@ Fluid is the answer. Fluid is similar to PyTorch and TensorFlow Eager Execution
Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
| Existed since | model as sequence of layers | model as graph of operators | No model |
|--|--|--|--|
| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
<table>
<thead>
<tr>
<th>Existed since</th>
<th>model as sequence of layers</th>
<th>model as graph of operators</th>
<th>No model</th>
</tr>
</thead>
<tbody>
<tr>
<td>2013 </td>
<td>Caffe, Theano, Torch, PaddlePaddle </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>2015 </td>
<td> </td>
<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
<td> </td>
</tr>
<tr>
<td>2016 </td>
<td> </td>
<td> </td>
<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
</tr>
</tbody>
</table>
From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model. To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
......
......@@ -36,11 +36,37 @@ At compile time, the Python program generates a protobuf message representation
At runtime, the C++ program realizes the graph and runs it.
| | Representation (protobuf messages) | Realization (C++ class objects) |
|---|---|---|
|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
|Block|BlockDesc|Block|
<table>
<thead>
<tr>
<th></th>
<th>Representation (protobuf messages)</th>
<th>Realization (C++ class objects) </th>
</tr>
</thead>
<tbody>
<tr>
<td>Data</td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107">VarDesc</a></td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24">Variable</a></td>
</tr>
<tr>
<td>Operation </td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35">OpDesc</a></td>
<td>
<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64">Operator</a></td>
</tr>
<tr>
<td>Block </td>
<td>BlockDesc </td>
<td>Block </td>
</tbody>
</table>
The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
......
......@@ -68,11 +68,33 @@ We roughly break down the project into 14 tasks:
Tasks parallelizable within phases:
Roadmap | Description | Parallelizable Tasks
----------- | :------------------------------------ | :--------------------
Phase I | Simplified model & components | *Task 1* ~ *Task 8*
Phase II | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
Phase III | Documentations | *Task13* ~ *Task14*
<table>
<thead>
<tr>
<th>Roadmap</th>
<th>Description</th>
<th> Parallelizable Tasks</th>
</tr>
</thead>
<tbody>
<tr>
<td>Phase I </td>
<td>Simplified model & components </td>
<td>Task 1 ~ Task 8</td>
</tr>
<tr>
<td>Phase II </td>
<td> Standard model & benchmarking & profiling</td>
<td>Task 9 ~ Task 12 </td>
</tr>
<tr>
<td>Phase III </td>
<td> Documentations</td>
<td> Task13 ~ Task14 </td>
</tr>
</tbody>
</table>
Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
......@@ -94,7 +116,7 @@ The classical DS2 network contains 15 layers (from bottom to top):
- **One** CTC-loss layer
<div align="center">
<img src="images/ds2_network.png" width=350><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
Figure 1. Archetecture of Deep Speech 2 Network.
</div>
......@@ -121,18 +143,63 @@ Key ingredients about the layers:
- Added to all above layers (except for data and loss layer).
- Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
<table>
<thead>
<tr>
<th>Required Components</th>
<th> PaddlePaddle Support</th>
<th> Need to Develop</th>
</tr>
</thead>
<tbody>
<tr>
<td>Data Layer I (Spectrogram) </td>
<td>Not supported yet.</td>
<td>TBD (Task 3)</td>
</tr>
<tr>
<td>Data Layer II (Transcription) </td>
<td> paddle.data_type.integer_value_sequence</td>
<td> - </td>
</tr>
<tr>
<td>2D Convolution Layer </td>
<td> paddle.layer.image_conv_layer</td>
<td> - </td>
</tr>
<tr>
<td>DataType Converter (vec2seq)</td>
<td> paddle.layer.block_expand</td>
<td> - </td>
</tr>
<tr>
<td>Bi-/Uni-directional RNNs </td>
<td>paddle.layer.recurrent_group</td>
<td> - </td>
</tr>
<tr>
<td>Row Convolution Layer </td>
<td>Not supported yet.</td>
<td>TBD (Task 4)</td>
</tr>
<tr>
<td>CTC-loss Layer </td>
<td>paddle.layer.warp_ctc</td>
<td> - </td>
</tr>
<tr>
<td>Batch Normalization Layer </td>
<td>paddle.layer.batch_norm</td>
<td> - </td>
</tr>
<tr>
<td>CTC-Beam search </td>
<td>Not supported yet.</td>
<td> TBD (Task 6) </td>
</tr>
</tbody>
</table>
Required Components | PaddlePaddle Support | Need to Develop
:------------------------------------- | :-------------------------------------- | :-----------------------
Data Layer I (Spectrogram) | Not supported yet. | TBD (Task 3)
Data Layer II (Transcription) | `paddle.data_type.integer_value_sequence` | -
2D Convolution Layer | `paddle.layer.image_conv_layer` | -
DataType Converter (vec2seq) | `paddle.layer.block_expand` | -
Bi-/Uni-directional RNNs | `paddle.layer.recurrent_group` | -
Row Convolution Layer | Not supported yet. | TBD (Task 4)
CTC-loss Layer | `paddle.layer.warp_ctc` | -
Batch Normalization Layer | `paddle.layer.batch_norm` | -
CTC-Beam search | Not supported yet. | TBD (Task 6)
### Row Convolution
......@@ -141,7 +208,7 @@ TODO by Assignees
### Beam Search with CTC and LM
<div align="center">
<img src="images/beam_search.png" width=600><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
......
......@@ -199,7 +199,7 @@ Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail i
## LoD and shape changes during decoding
<p align="center">
<img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
</p>
According to the image above, the only phase that changes the LoD is beam search.
......
......@@ -7,14 +7,14 @@ It applies several important concepts in machine learning system design, includi
In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
<p align="center">
<img src="./test.dot.png" width = "35%" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
</p>
The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
<p align="center">
<img src="./dcgan.png" width = "90%" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
Figure 2. Photo borrowed from the original DC-GAN paper.
</p>
......
......@@ -4,9 +4,9 @@
.. toctree::
:maxdepth: 1
new_op_en.md
new_op_kernel_en.md
use_eigen_en.md
new_op_cn.md
new_op_kernel.md
use_eigen_cn.md
name_convention.md
support_new_device.md
releasing_process.md
......
......@@ -5,7 +5,7 @@ Development
:maxdepth: 1
new_op_en.md
new_op_kernel_en.md
new_op_kernel.md
use_eigen_en.md
name_convention.md
support_new_device.md
......
......@@ -26,13 +26,32 @@
依据是否包含kernel,可以将Op分为两种:包含Kernel的Op和不包含kernel的Op,前者Op的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写,简单总结Op需要包含的内容如下:
内容 | 定义位置
-------------- | :----------------------
OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake
Op定义 | `.cc`文件
Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
<table>
<thead>
<tr>
<th>内容</th>
<th>定义位置</th>
</tr>
</thead>
<tbody>
<tr>
<td>OpProtoMake定义 </td>
<td>`.cc`文件,Backward Op不需要定义OpProtoMake </td>
</tr>
<tr>
<td>Op定义 </td>
<td> `.cc`文件</td>
</tr>
<tr>
<td>Kernel实现 </td>
<td> CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。</td>
</tr>
<tr>
<td>注册Op </td>
<td> Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中</td>
</tr>
</tbody>
</table>
实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc``*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
......
......@@ -33,6 +33,33 @@ Op definition | `.cc` files
Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
<table>
<thead>
<tr>
<th>Information</th>
<th> Where is it defined</th>
</tr>
</thead>
<tbody>
<tr>
<td>OpProtoMake definition </td>
<td> `.cc`files, Backward Op does not need an OpProtoMake interface. </td>
</tr>
<tr>
<td>Op definition </td>
<td> `.cc` files</td>
</tr>
<tr>
<td>Kernel implementation </td>
<td> The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.</td>
</tr>
<tr>
<td>Registering the Op </td>
<td> Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.</td>
</tr>
</tbody>
</table>
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
......
......@@ -37,7 +37,7 @@ PaddlePaddle每次发新的版本,遵循以下流程:
可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m``cp27mu`的版本。然后按照上述的方法
使用`twine`工具上传即可。
<img src="ci_build_whl.png">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
发型版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
......@@ -78,13 +78,116 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| API.V2 + Docker + GPU | | | | | | | | |
| API.V2 + Docker + CPU | | | | | | | | |
| `paddle_trainer` + Docker + GPU | | | | | | | | |
| `paddle_trainer` + Docker + CPU | | | | | | | | |
| API.V2 + Ubuntu + GPU | | | | | | | | |
| API.V2 + Ubuntu + CPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
<table>
<thead>
<tr>
<th></th>
<th>新手入门章节 </th>
<th> 识别数字</th>
<th> 图像分类</th>
<th>词向量</th>
<th> 情感分析</th>
<th>语意角色标注</th>
<th> 机器翻译</th>
<th>个性化推荐</th>
</tr>
</thead>
<tbody>
<tr>
<td>API.V2 + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + GPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>`paddle_trainer` + Docker + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> API.V2 + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td>API.V2 + Ubuntu + CPU </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + GPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
<tr>
<td> `paddle_trainer` + Ubuntu + CPU</td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
<td> </td>
</tr>
</tbody>
</table>
......@@ -16,18 +16,58 @@ As a result, we design a particular format for tensor serialization. By default,
The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
|field name | type | description |
| --- | --- | --- |
| version | uint32_t | Version of saved file. Always 0 now. |
| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
| tensor desc | void* | TensorDesc protobuf binary message |
| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
| lod_level | uint64_t | Level of LoD |
| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
| ... | ... | ... |
<table>
<thead>
<tr>
<th>field name</th>
<th>type </th>
<th>description </th>
</tr>
</thead>
<tbody>
<tr>
<td> version</td>
<td> uint32_t </td>
<td> Version of saved file. Always 0 now.</td>
</tr>
<tr>
<td> tensor desc length </td>
<td> uint32_t </td>
<td> TensorDesc(Protobuf message) length in bytes. </td>
</tr>
<tr>
<td>tensor desc </td>
<td> void*</td>
<td> TensorDesc protobuf binary message </td>
</tr>
<tr>
<td> tensor data </td>
<td> void* </td>
<td> Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` </td>
</tr>
<tr>
<td> lod_level</td>
<td> uint64_t </td>
<td> Level of LoD </td>
</tr>
<tr>
<td> length of lod[0] </td>
<td> uint64_t </td>
<td> [Optional] length of lod[0] in bytes. </td>
</tr>
<tr>
<td> data of lod[0] </td>
<td> uint64_t* </td>
<td> [Optional] lod[0].data() </td>
</tr>
<tr>
<td>... </td>
<td> ... </td>
<td> ... </td>
</tr>
</tbody>
</table>
## Summary
......
......@@ -99,13 +99,49 @@ for pass_id in range(100):
### 分布式训练脚本运行说明
分布式任务的运行需要将表格中说明的多个参数进行赋值:
| 参数名 | 值类型 | 说明 | 示例 |
|:-------------|:------|:---------------------------------------|:-------------|
| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 |
| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 |
| trainers | int | 训练节点的总个数,>0的数字 | 4 |
| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 |
| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER |
<table>
<thead>
<tr>
<th>参数名</th>
<th> 值类型</th>
<th>说明</th>
<th> 示例</th>
</tr>
</thead>
<tbody>
<tr>
<td>trainer_id </td>
<td> int</td>
<td> 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 </td>
<td> 0/1/2/3 </td>
</tr>
<tr>
<td>pservers </td>
<td> str</td>
<td> parameter server 列表 </td>
<td> 127.0.0.1:6710,127.0.0.1:6711 </td>
</tr>
<tr>
<td>trainers </td>
<td>int </td>
<td> 训练节点的总个数,>0的数字 </td>
<td> 4 </td>
</tr>
<tr>
<td> server_endpoint</td>
<td> str </td>
<td> 当前所起的服务节点的IP:PORT </td>
<td> 127.0.0.1:8789 </td>
</tr>
<tr>
<td> training_role</td>
<td>str </td>
<td> 节点角色, TRAINER/PSERVER </td>
<td> PSERVER </td>
</tr>
</tbody>
</table>
**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下:
......
......@@ -42,14 +42,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
每一列的含义是:
| 列名 | 含义 |
| --- | --- |
| ncalls | 函数的调用次数 |
| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
| percall | tottime的每次调用平均时间 |
| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
| percall | cumtime的每次调用平均时间 |
| filename:lineno(function) | 文件名, 行号,函数名 |
<table>
<thead>
<tr>
<th>列名</th>
<th>含义 </th>
</tr>
</thead>
<tbody>
<tr>
<td> ncalls</td>
<td> 函数的调用次数</td>
</tr>
<tr>
<td>tottime</td>
<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
</tr>
<tr>
<td> percall </td>
<td> tottime的每次调用平均时间</td>
</tr>
<tr>
<td> cumtime</td>
<td> 函数总时间。包含这个函数调用其他函数的时间</td>
</tr>
<tr>
<td> percall</td>
<td> cumtime的每次调用平均时间</td>
</tr>
<tr>
<td> filename:lineno(function) </td>
<td> 文件名, 行号,函数名 </td>
</tr>
</tbody>
</table>
### 寻找性能瓶颈
......
......@@ -57,14 +57,40 @@ port, we will see the output like the following:
where each line corresponds to Python function, and the meaning of
each column is as follows:
| column | meaning |
| --- | --- |
| ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
| percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls |
| filename:lineno(function) | where the function is defined |
<table>
<thead>
<tr>
<th>column</th>
<th>meaning </th>
</tr>
</thead>
<tbody>
<tr>
<td> ncalls</td>
<td> the number of calls into a function</td>
</tr>
<tr>
<td>tottime</td>
<td> the total execution time of the function, not including the execution time of other functions called by the function</td>
</tr>
<tr>
<td> percall </td>
<td> tottime divided by ncalls</td>
</tr>
<tr>
<td> cumtime</td>
<td> the total execution time of the function, including the execution time of other functions being called</td>
</tr>
<tr>
<td> percall</td>
<td> cumtime divided by ncalls</td>
</tr>
<tr>
<td> filename:lineno(function) </td>
<td> where the function is define </td>
</tr>
</tbody>
</table>
### Identify Performance Bottlenecks
......
......@@ -23,7 +23,7 @@ But how to record the time for the mixed C++ and CUDA program? There many C++ A
The overall flow is shown as the following figure.
<img src="./images/profiler.png" align="center"/><br/>
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/profiler.png" align="center"/><br/>
### Event
......
digraph G {
rnn [label="1st level RNN" shape=box]
subgraph cluster0 {
label = "time step 0"
sent0 [label="sentence"]
sent1 [label="sentence"]
rnn1 [label="2nd level RNN" shape=box]
sent0 -> rnn1
sent1 -> rnn1
}
subgraph cluster1 {
label = "time step 1"
sent2 [label="sentence"]
sent3 [label="sentence"]
rnn2 [label="2nd level RNN" shape=box]
sent2 -> rnn2
sent3 -> rnn2
}
subgraph cluster2 {
label = "time step 2"
sent4 [label="sentence"]
sent5 [label="sentence"]
rnn3 [label="2nd level RNN" shape=box]
sent4 -> rnn3
sent5 -> rnn3
}
para0 [label="paragraph info 0"]
para1 [label="paragraph info 1"]
para2 [label="paragraph info 2"]
rnn1 -> para0
rnn2 -> para1
rnn3 -> para2
para0 -> rnn
para1 -> rnn
para2 -> rnn
chapter [label="chapter info"]
rnn -> chapter
}
digraph ImageBatchNormForkGragh {
subgraph cluster_before {
Prev [label="...", shape=plaintext];
Rnn [label="rnn_op", shape=box];
BatchNorm [label="batch_norm_op", shape=box];
Fc [label="fc_op", shape=box];
After [label="...", shape=plaintext];
Prev -> Rnn -> BatchNorm -> Fc -> After;
label="original";
}
subgraph cluster_after {
Prev2 [label="...", shape=plaintext];
Rnn2 [label="rnn_op", shape=box];
BatchNorm2_1 [label="train_batch_norm_op", shape=box];
BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
Fc2_1 [label="fc_op", shape=box];
Fc2_2 [label="fc_op", shape=box];
After2_1 [label="...", shape=plaintext];
After2_2 [label="...", shape=plaintext];
Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
label="forked";
}
}
cat ./graph_construction_example.dot | \
sed 's/color=red/color=red, style=invis/g' | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_only.png
cat ./graph_construction_example.dot | \
sed 's/color=green/color=green, style=invis/g' | \
dot -Tpng > graph_construction_example_forward_backward.png
cat ./graph_construction_example.dot | \
dot -Tpng > graph_construction_example_all.png
digraph ImageClassificationGraph {
///////// The forward part /////////
FeedX [label="Feed", color=blue, shape=box];
FeedY [label="Feed", color=blue, shape=box];
InitW [label="Init", color=blue, shape=diamond];
Initb [label="Init", color=blue, shape=diamond];
FC [label="FC", color=blue, shape=box];
MSE [label="MSE", color=blue, shape=box];
x [label="x", color=blue, shape=oval];
l [label="l", color=blue, shape=oval];
y [label="y", color=blue, shape=oval];
W [label="W", color=blue, shape=doublecircle];
b [label="b", color=blue, shape=doublecircle];
cost [label="cost", color=blue, shape=oval];
FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
FeedY -> l [color=blue];
InitW -> W [color=blue];
Initb -> b [color=blue];
W -> FC [color=blue];
b -> FC [color=blue];
l -> MSE [color=blue];
////////// The backward part /////////
MSE_Grad [label="MSE_grad", color=red, shape=box];
FC_Grad [label="FC_grad", color=red, shape=box];
d_cost [label="d cost", color=red, shape=oval];
d_y [label="d y", color=red, shape=oval];
d_b [label="d b", color=red, shape=oval];
d_W [label="d W", color=red, shape=oval];
cost -> MSE_Grad [color=red];
d_cost -> MSE_Grad [color=red];
l -> MSE_Grad [color=red];
y -> MSE_Grad -> d_y [color=red];
x -> FC_Grad [color=red];
y -> FC_Grad [color=red];
d_y -> FC_Grad [color=red];
W -> FC_Grad -> d_W [color=red];
b -> FC_Grad -> d_b [color=red];
////////// The optimizaiton part //////////
OPT_W [label="SGD", color=green, shape=box];
OPT_b [label="SGD", color=green, shape=box];
W -> OPT_W [color=green];
b -> OPT_b [color=green];
d_W -> OPT_W -> W [color=green];
d_b -> OPT_b -> b [color=green];
////////// Groupings //////////
subgraph clusterMSE {
style=invis;
MSE;
MSE_Grad;
}
subgraph clusterFC {
style=invis;
FC;
FC_Grad;
}
}
digraph G {
label = "simple RNN implementation"
ranksep=2;
//graph [nodesep=1, ranksep=1];
node[nodesep=1]
subgraph cluster0 {
label = "global scope"
rankdir = TB
W
boot_memory
input
output
}
subgraph cluster1 {
label = "step-scope 0"
rankdir = TB
memory0[label="memory"]
prememory0[label="pre-memory"]
step_input0[label="step input"]
step_output0[label="step output"]
}
subgraph cluster2 {
label = "step-scope 1"
rankdir = TB
memory1[label="memory"]
prememory1[label="pre-memory"]
step_input1[label="step input"]
step_output1[label="step output"]
}
subgraph cluster3 {
label = "step-scope 2"
rankdir = TB
memory2[label="memory"]
prememory2[label="pre-memory"]
step_input2[label="step input"]
step_output2[label="step output"]
}
stepnet [shape=box]
stepnet0 [shape=box, style=dashed]
stepnet1 [shape=box, style=dashed]
stepnet2 [shape=box, style=dashed]
edge[color=blue]
boot_memory -> prememory0 [label="init" color="blue"]
memory0 -> prememory1 [label="copy/reference" color="blue"]
memory1 -> prememory2 [label="copy/reference" color="blue"]
edge[color=black]
W -> stepnet0[constraint=false, style=dashed]
W -> stepnet1[constraint=false, style=dashed]
W -> stepnet2[constraint=false, style=dashed]
memory0 -> stepnet0[style=dashed]
prememory0 -> stepnet0 -> step_output0[style=dashed]
memory1 -> stepnet1[style=dashed]
prememory1 -> stepnet1 -> step_output1[style=dashed]
memory2 -> stepnet2[style=dashed]
prememory2 -> stepnet2 -> step_output2[style=dashed]
input -> step_input0
input -> step_input1
input -> step_input2
step_input0 -> stepnet0 [style=dashed]
step_input1 -> stepnet1[style=dashed]
step_input2 -> stepnet2[style=dashed]
step_output0 -> output
step_output1 -> output
step_output2 -> output
stepnet0 -> stepnet[style=dashed]
stepnet1 -> stepnet[style=dashed]
stepnet2 -> stepnet[style=dashed]
}
digraph G {
chapter [label="chapter"]
subgraph cluster0 {
label = "paragraph 0"
top_rnn0[label="top rnn step 0" shape=box]
p0 [label="paragraph 0"]
p1 [label="paragraph 1"]
}
subgraph cluster1{
label = "paragraph 1"
top_rnn1[label="top rnn step 1" shape=box]
p2 [label="paragraph 0"]
p3 [label="paragraph 1"]
}
subgraph cluster_p0 {
label = "sentence 0"
low_rnn0 [label="low rnn step 0" shape=box]
s00 [label="sentence 0"]
s01 [label="sentence 1"]
low_rnn0 -> s00
low_rnn0 -> s01
}
subgraph cluster_p1 {
label = "sentence 1"
low_rnn1 [label="low rnn step 1" shape=box]
s10 [label="sentence 0"]
s11 [label="sentence 1"]
low_rnn1 -> s10
low_rnn1 -> s11
}
subgraph cluster_p2 {
label = "sentence 1"
low_rnn2 [label="low rnn step 0" shape=box]
s20 [label="sentence 0"]
s21 [label="sentence 1"]
low_rnn2 -> s20
low_rnn2 -> s21
}
subgraph cluster_p3 {
label = "sentence 1"
low_rnn3 [label="low rnn step 1" shape=box]
s30 [label="sentence 0"]
s31 [label="sentence 1"]
low_rnn3 -> s30
low_rnn3 -> s31
}
chapter -> top_rnn0
chapter -> top_rnn1
top_rnn0 -> p0
top_rnn0 -> p1
top_rnn1 -> p2
top_rnn1 -> p3
p0 -> low_rnn0
p1 -> low_rnn1
p2 -> low_rnn2
p3 -> low_rnn3
}
digraph Test {
z -> generator -> G_img;
G_img -> discriminator -> D_f -> d_loss_f;
label0 -> d_loss_f -> d_loss;
img -> discriminator -> D_t -> d_loss_t;
label1 -> d_loss_t -> d_loss;
d_loss -> d_loss_t[color=red, style=dashed];
d_loss -> d_loss_f[color=red, style=dashed];
d_loss_t -> D_t[color=red, style=dashed];
d_loss_f -> D_f[color=red, style=dashed];
D_t -> discriminator[color=red, style=dashed];
D_f -> discriminator[color=red, style=dashed];
D_f -> g_loss;
label2 -> g_loss;
g_loss -> D_f[color=green, style=dashed];
D_f -> discriminator[color=green, style=dashed];
discriminator -> G_img[color=green, style=dashed];
G_img -> generator[color=green, style=dashed];
discriminator [color=red, shape=box];
generator [color=green, shape=box];
z [shape=diamond];
img [shape=diamond];
label0 [shape=diamond];
label1 [shape=diamond];
label2 [shape=diamond];
d_loss [color=red];
g_loss [color=green];
}
......@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
......
......@@ -13,7 +13,7 @@
# serve to show the default.
import sys
import os, subprocess
sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
import shlex
from recommonmark import parser, transform
import paddle
......
......@@ -27,7 +27,7 @@ sphinx_add_target(paddle_v2_docs
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_v2_docs gen_proto_py)
add_dependencies(paddle_v2_docs gen_proto_py paddle_python)
# configured documentation tools and intermediate build results
set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
......@@ -50,6 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_CN})
add_dependencies(paddle_v2_docs_cn gen_proto_py)
add_dependencies(paddle_v2_docs_cn gen_proto_py paddle_python)
add_subdirectory(api)
......@@ -19,4 +19,4 @@ sphinx_add_target(paddle_v2_apis
${CMAKE_CURRENT_SOURCE_DIR}
${SPHINX_HTML_DIR_EN})
add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind)
add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
# Recurrent Group Tutorial
TBD
## Overview
Sequential data is common in natural language processing.
A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words.
PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: <a href = "hierarchical_layer_en.html">Layers for supporting double-layer sequences as input.</a>
## Related Concepts
### Basic Principle
`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
In PaddlePaddle, a simple call to `recurrent_group` is as follows:
``` python
recurrent_group(step, input, reverse)
```
- step: A callable function that defines the calculations completed by the RNN unit within a time step
- input: The input must be a single-layer sequence or a double-layer sequence
- reverse: Whether to process the input sequence in reverse order
The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
### Input
The input sequence processed by `recurrent_group` is mainly divided into the following three types:
- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
### Input Example
Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice.
Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401).
In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
### Output
The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
### Memory
Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation.
The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
## Sequence-level RNN Introduction
`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
- Word-level RNN: each state corresponds to a word.
- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence.
For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
## Usage of Sequence-level RNN
### Usage of Training Process
Using `recurrent_group` requires the following conventions:
- **Single-input Single-output**: Both input and output are single layer sequences.
- If there are multiple inputs, the number of words in different input sequences must be exactly equal.
- A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
- memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
- boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false.
- **Double-input Double-output**: Both input and output are two-level sequence.
- If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal.
- output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default.
- memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent.
- boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now".
### Usage of Generation Process
Using `beam_search` need follow those conventions:
- Word-level RNN: generate the next word from a word.
- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.
......@@ -89,16 +89,17 @@ SWIG_LINK_LIBRARIES(swig_paddle
${START_END}
)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
COMMAND ${CMAKE_COMMAND} -E touch .timestamp
add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
DEPENDS _swig_paddle
)
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
if(WITH_TESTING)
IF(NOT PY_PIP_FOUND)
......
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
py_test(testTrain SRCS testTrain.py)
py_test(testMatrix SRCS testMatrix.py)
py_test(testVector SRCS testVector.py)
......
......@@ -370,4 +370,48 @@ extern void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t groups);
/**
* @brief Upsample forward.
* @param[in] inputData input data.
* @param[out] maskData the mask data from MaxPoolWithMaskLayer.
* @param[out] batchSize the batch size of the input.
* @param[in] imgSizeH image height.
* @param[in] imgSizeW image width.
* @param[in] channels the input channels.
* @param[in] outputH the output height.
* @param[in] outputW the output widht.
* @param[out] outputData output data.
*/
extern void hl_upsample_forward(real* inputData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* outputData);
/**
* @brief Upsample backward.
* @param[in] outputGradData the output grad data.
* @param[out] maskData the mask data from MaxPoolWithMaskLayer.
* @param[out] batchSize the batch size of the input.
* @param[in] imgSizeH image height.
* @param[in] imgSizeW image width.
* @param[in] channels the input channels.
* @param[in] outputH the output height.
* @param[in] outputW the output widht.
* @param[out] inputGradData the input grad data.
*/
extern void hl_upsample_backward(real* outputGradData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* inputGradData);
#endif // HL_CNN_H_
......@@ -224,4 +224,24 @@ inline void hl_maxout_backward(real* inGrad,
size_t featLen,
size_t group) {}
inline void hl_upsample_forward(real* inputData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* outputData) {}
inline void hl_upsample_backward(real* outputGradData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* inputGradData) {}
#endif // HL_CNN_STUB_H_
......@@ -1028,3 +1028,79 @@ void hl_maxout_backward(real* inGrad,
num_kernels, inGrad, outGrad, idData, size, featLen, groups);
CHECK_SYNC("hl_maxout_backward failed");
}
__global__ void upsampleForwardCompute(real* input_data,
real* mask_data,
size_t nthreads,
size_t in_h,
size_t in_w,
size_t out_h,
size_t out_w,
real* output_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) {
int offset = index / (in_w * in_h) * out_h * out_w;
int upsample_idx = static_cast<int>(mask_data[index]);
output_data[offset + upsample_idx] = input_data[index];
}
}
__global__ void upsampleBackwardCompute(real* out_grad,
real* mask_data,
size_t nthreads,
size_t in_h,
size_t in_w,
size_t out_h,
size_t out_w,
real* input_grad) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) {
int offset = index / (in_w * in_h) * out_h * out_w;
int upsample_idx = static_cast<int>(mask_data[index]);
input_grad[index] = out_grad[offset + upsample_idx];
}
}
void hl_upsample_forward(real* inputData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* outputData) {
int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
int blocks = (num_kernels + 1024 - 1) / 1024;
upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
maskData,
num_kernels,
imgSizeH,
imgSizeW,
outputH,
outputW,
outputData);
CHECK_SYNC("hl_upsample_forward failed");
}
void hl_upsample_backward(real* outputGradData,
real* maskData,
size_t batchSize,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW,
real* inputGradData) {
int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
int blocks = (num_kernels + 1024 - 1) / 1024;
upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
maskData,
num_kernels,
imgSizeH,
imgSizeW,
outputH,
outputW,
inputGradData);
CHECK_SYNC("hl_upsample_backward failed");
}
......@@ -74,8 +74,8 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <deque>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
......@@ -96,6 +97,8 @@ class BlockDesc {
*/
void RemoveOp(size_t s, size_t e);
void RemoveVar(const std::string &name) { vars_.erase(name); }
std::vector<OpDesc *> AllOps() const;
size_t OpSize() const { return ops_.size(); }
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <condition_variable>
#include <condition_variable> // NOLINT
#include <typeindex>
#include "paddle/fluid/platform/enforce.h"
......@@ -216,7 +216,8 @@ class ChannelHolder {
template <typename T>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
explicit PlaceholderImpl(size_t buffer_size)
: type_(std::type_index(typeid(T))) {
channel_.reset(MakeChannel<T>(buffer_size));
}
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
#include <condition_variable>
#include <condition_variable> // NOLINT
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -38,7 +38,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
virtual void Unlock();
virtual bool IsClosed();
virtual void Close();
ChannelImpl(size_t);
explicit ChannelImpl(size_t);
virtual ~ChannelImpl();
virtual void AddToSendQ(const void *referrer, T *data,
......@@ -60,7 +60,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
const void *referrer; // TODO(thuan): figure out better way to do this
std::function<bool(ChannelAction)> callback;
QueueMessage(T *item)
explicit QueueMessage(T *item)
: data(item), cond(std::make_shared<std::condition_variable_any>()) {}
QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
......@@ -88,15 +88,15 @@ class ChannelImpl : public paddle::framework::Channel<T> {
}
std::shared_ptr<QueueMessage> get_first_message(
std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
while (!queue.empty()) {
std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
while (!queue->empty()) {
// Check whether this message was added by Select
// If this was added by Select then execute the callback
// to check if you can execute this message. The callback
// can return false if some other case was executed in Select.
// In that case just discard this QueueMessage and process next.
std::shared_ptr<QueueMessage> m = queue.front();
queue.pop_front();
std::shared_ptr<QueueMessage> m = queue->front();
queue->pop_front();
if (m->callback == nullptr || m->callback(action)) return m;
}
return nullptr;
......@@ -147,7 +147,7 @@ void ChannelImpl<T>::Send(T *item) {
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m =
get_first_message(recvq, ChannelAction::SEND);
get_first_message(&recvq, ChannelAction::SEND);
if (m != nullptr) {
*(m->data) = std::move(*item);
......@@ -198,7 +198,7 @@ bool ChannelImpl<T>::Receive(T *item) {
// buffer and move front of send queue to the buffer
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m =
get_first_message(sendq, ChannelAction::RECEIVE);
get_first_message(&sendq, ChannelAction::RECEIVE);
if (buf_.size() > 0) {
// Case 1 : Channel is Buffered
// Do Data transfer from front of buffer
......@@ -219,9 +219,10 @@ bool ChannelImpl<T>::Receive(T *item) {
if (m != nullptr) {
*item = std::move(*(m->data));
m->Notify();
} else
} else {
return recv_return(Receive(item));
}
}
return recv_return(true);
}
......
......@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/channel.h"
#include <chrono>
#include <thread>
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include "gtest/gtest.h"
using paddle::framework::Channel;
......@@ -166,9 +166,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
if (i < buffer_size)
if (i < buffer_size) {
ch->Send(&i); // should block after 10 iterations
else {
} else {
bool is_exception = false;
try {
ch->Send(&i);
......@@ -212,12 +212,12 @@ TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
}
void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -230,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
......@@ -241,21 +241,21 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
......@@ -277,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
......@@ -294,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a buffered channel also unblocks
......@@ -409,13 +409,13 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
......@@ -438,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
......@@ -454,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successful sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
......@@ -473,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
}
// Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channel also unblocks
// any receivers waiting on the channel
void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -498,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
......@@ -679,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
}
void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -697,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
......@@ -708,21 +708,21 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
......@@ -744,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (!thread_ended[i]) ct++;
}
EXPECT_GE(ct, 4);
} else {
// If ch is UnBuffered, all the threads should be blocked.
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
......@@ -761,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that closing a channelholder unblocks
......@@ -813,13 +813,13 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
// This tests that destroying a channelholder unblocks
// any senders waiting for channel
void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
bool send_success[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
bool send_success[kNumThreads];
// Launches threads that try to write and are blocked because of no readers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
send_success[i] = false;
t[i] = std::thread(
......@@ -841,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (thread_ended[i] == false) ct++;
}
// Atleast 4 threads must be blocked
EXPECT_GE(ct, 4);
} else {
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
}
......@@ -857,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successfuld sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
if (send_success[i]) ct++;
}
......@@ -876,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
}
// Join all threads
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
// This tests that destroying a channelholder also unblocks
// any receivers waiting on the channel
void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
const size_t kNumThreads = 5;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to read and are blocked because of no writers
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -901,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], false);
}
// delete the channel
delete ch;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
......@@ -945,12 +945,12 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
// This tests that closing a channelholder many times.
void ChannelHolderManyTimesClose(ChannelHolder *ch) {
const int num_threads = 15;
std::thread t[num_threads];
bool thread_ended[num_threads];
const int kNumThreads = 15;
std::thread t[kNumThreads];
bool thread_ended[kNumThreads];
// Launches threads that try to send data to channel.
for (size_t i = 0; i < num_threads / 3; i++) {
for (size_t i = 0; i < kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *ended) {
......@@ -962,7 +962,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to receive data to channel.
for (size_t i = num_threads / 3; i < 2 * num_threads / 3; i++) {
for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -976,7 +976,7 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
}
// Launches threads that try to close the channel.
for (size_t i = 2 * num_threads / 3; i < num_threads; i++) {
for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
thread_ended[i] = false;
t[i] = std::thread(
[&](bool *p) {
......@@ -991,13 +991,13 @@ void ChannelHolderManyTimesClose(ChannelHolder *ch) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
// Verify that all threads are unblocked
for (size_t i = 0; i < num_threads; i++) {
for (size_t i = 0; i < kNumThreads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
EXPECT_TRUE(ch->IsClosed());
// delete the channel
delete ch;
for (size_t i = 0; i < num_threads; i++) t[i].join();
for (size_t i = 0; i < kNumThreads; i++) t[i].join();
}
TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
......
......@@ -16,6 +16,6 @@ else()
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
......@@ -21,6 +21,9 @@
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
#include <string>
#include <vector>
namespace paddle {
namespace framework {
namespace details {
......@@ -55,6 +58,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
auto graph = new SSAGraph();
SSAGraph &result = *graph;
std::unordered_set<std::string> og_has_been_broadcast;
result.vars_.resize(places_.size());
bool is_forwarding = true;
......@@ -122,9 +126,15 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
if (!is_forwarding) {
auto var_names = op->OutputArgumentNames();
// Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. But there are no
// other cases, for example, we need to adjust the gradient according to
// the input when we get the gradient, which is not considered at present.
for (auto &og : var_names) {
if (grad_names_.count(og) != 0) { // is param grad
if (grad_names_.count(og) != 0 &&
og_has_been_broadcast.count(og) == 0) { // is param grad
// Insert NCCL AllReduce Op
og_has_been_broadcast.insert(og);
#ifdef PADDLE_WITH_CUDA
result.ops_.emplace_back(
new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
......@@ -161,6 +171,11 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
*/
PolishGraphToSupportDataHazards(&result);
/*
* Only variables should be the leaves of graph.
*/
AddOutputToLeafOps(&result);
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
PrintGraphviz(*graph, sout);
......
......@@ -76,7 +76,7 @@ void NCCLAllReduceOpHandle::RunImpl() {
}
}
std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; }
std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; }
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -14,6 +14,9 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......@@ -34,6 +37,10 @@ struct NCCLAllReduceOpHandle : public OpHandleBase {
std::string Name() const override;
// Delay and buffer nccl_all_reduce together can significantly increase
// performance. Disable this feature by returning false.
bool IsMultiDeviceTransfer() override { return true; };
protected:
void RunImpl() override;
};
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -53,6 +55,10 @@ class OpHandleBase {
void AddOutput(VarHandleBase *out);
// If the Op involves data transfer of multiple devices that
// will likely block other computations.
virtual bool IsMultiDeviceTransfer() { return false; }
protected:
virtual void RunImpl() = 0;
};
......
......@@ -136,6 +136,17 @@ void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) {
sout << "}\n";
}
void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
for (auto &op : graph->ops_) {
if (!op->outputs_.empty()) {
continue;
}
auto *dummy_leaf = new DummyVarHandle();
graph->dep_vars_.emplace(dummy_leaf);
op->AddOutput(dummy_leaf);
}
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -14,13 +14,13 @@
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include <memory>
#include <string>
namespace paddle {
namespace framework {
namespace details {
......@@ -52,6 +52,8 @@ class SSAGraphBuilder {
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
static void AddOutputToLeafOps(SSAGraph *graph);
static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout);
};
} // namespace details
......
......@@ -23,22 +23,36 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
size_t num_threads, bool use_event,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph)
std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
: SSAGraphExecutor(std::move(graph)),
pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
local_scopes_(local_scopes),
places_(places),
fetch_ctxs_(places),
use_event_(use_event) {}
use_event_(use_event),
running_ops_(0),
allow_op_delay_(allow_op_delay) {}
void ThreadedSSAGraphExecutor::RunDelayedOps(
const std::unordered_set<OpHandleBase *> &delayed_ops) {
for (auto op : delayed_ops) {
op->Run(use_event_);
}
}
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
BlockingQueue<VarHandleBase *> ready_vars;
std::unordered_set<OpHandleBase *> ready_ops;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule
// together since we currently cannot overlap computation and memcpy streams.
// Should revisit it if overlapping is available.
std::unordered_set<OpHandleBase *> delayed_ops;
std::unordered_set<OpHandleBase *> blocked_by_delayed_ops;
std::unordered_set<VarHandleBase *> delayed_vars;
auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) {
pending_vars.insert(&var);
......@@ -73,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Step 2. Insert FetchOps
std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
std::vector<DummyVarHandle> dummy_vars;
FeedFetchList fetch_data(fetch_tensors.size());
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
......@@ -87,13 +100,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_);
fetch_ops.emplace_back(op);
// FIXME: Use new device context
for (auto &p : places_) {
op->dev_ctxes_[p] = fetch_ctxs_.Get(p);
}
......@@ -101,12 +114,24 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for (auto *var : vars) {
op->AddInput(var);
}
auto *fetch_dummy = new DummyVarHandle();
op->AddOutput(fetch_dummy);
fetch_dependencies.emplace(fetch_dummy);
InsertPendingVar(*fetch_dummy);
InsertPendingOp(*op);
}
auto run_all_ready_ops = [&] {
for (auto *op : ready_ops) {
RunOp(ready_vars, op);
if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
delayed_ops.insert(op);
delayed_vars.insert(op->outputs_.begin(), op->outputs_.end());
ready_vars.Extend(op->outputs_);
continue;
}
running_ops_++;
RunOp(&ready_vars, op);
}
ready_ops.clear();
};
......@@ -118,13 +143,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
// Step 3. Execution
while (!pending_vars.empty()) {
while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
// 1. Run All Ready ops
run_all_ready_ops();
// 2. Find ready variable
bool timeout;
auto cur_ready_vars = ready_vars.PopAll(1000, &timeout);
auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
if (timeout) {
if (exception_) {
......@@ -141,13 +166,29 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
if (delayed_vars.find(ready_var) != delayed_vars.end()) {
blocked_by_delayed_ops.insert(op);
} else {
ready_ops.insert(op);
}
}
}
}
// When there are no other ops to schedule, schedule buffered delayed
// ops and unblock other ops.
if (ready_ops.empty() && !delayed_ops.empty() && running_ops_ == 0) {
RunDelayedOps(delayed_ops);
delayed_ops.clear();
for (auto *op : blocked_by_delayed_ops) {
ready_ops.insert(op);
}
blocked_by_delayed_ops.clear();
}
// Keep loop until all vars are ready.
}
PADDLE_ENFORCE(ready_ops.empty());
PADDLE_ENFORCE(delayed_ops.empty());
PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
++computation_count_;
auto sync_computation = [&] {
......@@ -182,12 +223,13 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
void ThreadedSSAGraphExecutor::RunOp(
BlockingQueue<VarHandleBase *> &ready_var_q, details::OpHandleBase *op) {
auto op_run = [&ready_var_q, op, this] {
BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
auto op_run = [ready_var_q, op, this] {
try {
VLOG(10) << op->Name() << " : " << op->DebugString();
op->Run(use_event_);
ready_var_q.Extend(op->outputs_);
running_ops_--;
ready_var_q->Extend(op->outputs_);
} catch (platform::EnforceNotMet ex) {
exception_.reset(new platform::EnforceNotMet(ex));
} catch (...) {
......
......@@ -14,7 +14,12 @@
#pragma once
#include <chrono>
#include <deque>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include <functional>
#include "ThreadPool.h" // ThreadPool in thrird party
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
......@@ -70,7 +75,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph);
std::unique_ptr<SSAGraph> &&graph,
bool allow_op_delay);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
......@@ -79,9 +85,11 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~ThreadedSSAGraphExecutor() {}
private:
void RunOp(BlockingQueue<VarHandleBase *> &ready_var_q,
void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
details::OpHandleBase *op);
void RunDelayedOps(const std::unordered_set<OpHandleBase *> &delayed_ops);
private:
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
......@@ -89,6 +97,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
platform::DeviceContextPool fetch_ctxs_;
const bool use_event_;
std::unique_ptr<platform::EnforceNotMet> exception_;
std::atomic<int> running_ops_;
bool allow_op_delay_;
size_t computation_count_{0};
size_t max_async_computation{100};
......
......@@ -279,6 +279,21 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
return std::unique_ptr<ExecutorPrepareContext>(ctx);
}
std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids) {
std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
for (auto& bid : block_ids) {
auto* ctx = new ExecutorPrepareContext(program, bid);
PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
auto& block = program.Block(bid);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
}
return result;
}
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
......
......@@ -61,6 +61,9 @@ class Executor {
static std::unique_ptr<ExecutorPrepareContext> Prepare(
const ProgramDesc& program, int block_id);
static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
const ProgramDesc& program, const std::vector<int>& block_ids);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
......
......@@ -142,6 +142,7 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
// Split LoDTensor and copy to each place specified in places.
std::vector<LoDTensor> SplitLoDTensor(
const std::vector<platform::Place> places) const;
......
......@@ -35,6 +35,17 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
if (var->IsType<framework::LoDTensor>()) {
return framework::ToDataType(var->Get<framework::LoDTensor>().type());
} else if (var->IsType<framework::SelectedRows>()) {
return framework::ToDataType(
var->Get<framework::SelectedRows>().value().type());
} else {
PADDLE_THROW("Var should be LoDTensor or SelectedRows");
}
}
static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
......
......@@ -61,6 +61,8 @@ inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
}
proto::VarType::Type GetDataTypeOfVar(const Variable* var);
class OperatorBase;
class ExecutionContext;
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include "paddle/fluid/platform/profiler.h"
#include <string>
#include <vector>
......@@ -47,7 +48,7 @@ ParallelExecutor::ParallelExecutor(
const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program, const ProgramDesc &main_program,
const std::string &loss_var_name, Scope *scope)
const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
: member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope;
......@@ -82,8 +83,8 @@ ParallelExecutor::ParallelExecutor(
auto graph = builder.Build(main_program);
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
num_threads, use_event, member_->local_scopes_, places,
std::move(graph)));
num_threads, use_event, member_->local_scopes_, places, std::move(graph),
allow_op_delay));
// Step 3. Create vars in each scope;
for (auto *scope : member_->local_scopes_) {
......@@ -149,12 +150,30 @@ void ParallelExecutor::BCastParamsToGPUs(
#endif
}
void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name) {
void ParallelExecutor::Run(
const std::vector<std::string> &fetch_tensors,
const std::string &fetched_var_name,
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
platform::RecordBlock b(0);
SplitTensorToPlaces(feed_tensors);
auto fetch_data = member_->executor_->Run(fetch_tensors);
*member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
fetch_data;
}
void ParallelExecutor::SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
for (auto it : feed_tensors) {
auto lod_tensors = it.second.SplitLoDTensor(member_->places_);
for (size_t j = 0; j < member_->places_.size(); ++j) {
// TODO(panxy0718): Do I need to delete this var?
member_->local_scopes_[j]
->Var(it.first)
->GetMutable<LoDTensor>()
->ShareDataWith(lod_tensors[j]);
}
}
}
} // namespace framework
} // namespace paddle
......@@ -14,8 +14,9 @@ limitations under the License. */
#pragma once
#include <future>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -37,12 +38,17 @@ class ParallelExecutor {
const std::unordered_set<std::string>& params,
const ProgramDesc& startup_program,
const ProgramDesc& main_program,
const std::string& loss_var_name, Scope* scope);
const std::string& loss_var_name, Scope* scope,
bool allow_op_delay);
void Run(const std::vector<std::string>& fetch_tensors,
const std::string& fetched_var_name = "fetched_var");
const std::string& fetched_var_name,
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
private:
void SplitTensorToPlaces(
const std::unordered_map<std::string, LoDTensor>& feed_tensors);
ParallelExecutorPrivate* member_;
void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,6 +16,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -47,6 +50,15 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/**
* get the index of id in rows
*/
int64_t index(int64_t id) const {
auto it = std::find(rows_.begin(), rows_.end(), id);
PADDLE_ENFORCE(it != rows_.end(), "id should be in rows");
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
dims[0] = height_;
......
......@@ -45,11 +45,10 @@ class Tensor {
friend struct EigenVector;
public:
Tensor() : offset_(0), is_pinned_(false) {}
Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place)
: offset_(0), is_pinned_(false) {
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
......@@ -70,12 +69,11 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(platform::Place place, bool is_pinned = false);
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type,
bool is_pinned = false);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place, bool is_pinned = false);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
......@@ -86,8 +84,7 @@ class Tensor {
* @note If not exist, then allocation.
*/
template <typename T>
inline T* mutable_data(DDim dims, platform::Place place,
bool is_pinned = false);
inline T* mutable_data(DDim dims, platform::Place place);
/*! Return the dimensions of the memory block. */
inline const DDim& dims() const;
......@@ -95,9 +92,6 @@ class Tensor {
/*! Return the numel of the memory block. */
inline int64_t numel() const;
/*! Return the numel of the memory block. */
inline bool isPinned() const;
/*! Resize the dimensions of the memory block. */
inline Tensor& Resize(const DDim& dims);
......@@ -152,14 +146,12 @@ class Tensor {
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size, std::type_index type,
bool is_pinned = false)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size),
type_(type),
is_pinned_(is_pinned) {
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
......@@ -182,9 +174,6 @@ class Tensor {
/* the current type of memory */
std::type_index type_;
/*! use pinned memory or not. */
bool is_pinned_;
};
/*! holds the memory block if allocated. */
......@@ -219,7 +208,6 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
bool is_pinned_;
};
inline void Tensor::switch_place(platform::Place new_place) {
......
......@@ -101,21 +101,19 @@ inline T* Tensor::data() {
}
template <typename T>
inline T* Tensor::mutable_data(DDim dims, platform::Place place,
bool is_pinned) {
inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
Resize(dims);
return mutable_data<T>(place, is_pinned);
return mutable_data<T>(place);
}
template <typename T>
inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
......@@ -129,27 +127,33 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type, is_pinned));
} else if (platform::is_gpu_place(place)) {
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
PADDLE_THROW(
"CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
}
#else
if (platform::is_gpu_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
boost::get<platform::CUDAPlace>(place), size, type));
} else if (platform::is_cuda_pinned_place(place)) {
holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
boost::get<platform::CUDAPinnedPlace>(place), size, type));
}
}
#endif
offset_ = 0;
is_pinned_ = is_pinned;
}
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type(), is_pinned);
"Cannot invoke mutable data if current hold nothing.");
return mutable_data(place, holder_->type());
}
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
......@@ -191,8 +195,6 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
inline bool Tensor::isPinned() const { return is_pinned_; }
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith(src);
......
......@@ -148,6 +148,11 @@ struct AnyVisitor : public boost::static_visitor<bool> {
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
};
template <typename Predicate>
......
......@@ -35,24 +35,25 @@ class Tuple {
public:
using ElementVars = std::vector<ElementVar>;
Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
Tuple(const std::vector<ElementVar>& var,
const std::vector<VarDesc>& var_desc)
: var_(var), var_desc_(var_desc) {}
Tuple(std::vector<ElementVar>& var) : var_(var) {}
explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
ElementVar get(int idx) const { return var_[idx]; };
ElementVar get(int idx) const { return var_[idx]; }
ElementVar& get(int idx) { return var_[idx]; };
ElementVar& get(int idx) { return var_[idx]; }
bool isSameType(Tuple& t) const;
bool isSameType(const Tuple& t) const;
size_t getSize() const { return var_.size(); };
size_t getSize() const { return var_.size(); }
private:
ElementVars var_;
std::vector<VarDesc> var_desc_;
};
bool Tuple::isSameType(Tuple& t) const {
bool Tuple::isSameType(const Tuple& t) const {
size_t tuple_size = getSize();
if (tuple_size != t.getSize()) {
return false;
......
......@@ -41,8 +41,7 @@ bool IsPersistable(const framework::VarDesc* var) {
return false;
}
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename) {
......@@ -108,10 +107,8 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
}
std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor& executor,
framework::Scope& scope,
const std::string& prog_filename,
const std::string& param_filename) {
framework::Executor& executor, framework::Scope& scope,
const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str;
ReadBinaryFile(model_filename, program_desc_str);
......
......@@ -24,8 +24,7 @@ limitations under the License. */
namespace paddle {
namespace inference {
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
const framework::ProgramDesc& main_program,
const std::string& dirname,
const std::string& param_filename);
......
......@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
set(arg_list "")
if(inference_test_ARGS)
foreach(arg ${inference_test_ARGS})
......
......@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -30,8 +30,8 @@ TEST(inference, fit_a_line) {
// The second dim of the input tensor should be 13
// The input data should be >= 0
int64_t batch_size = 10;
SetupTensor<float>(
input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
SetupTensor<float>(&input, {batch_size, 13}, static_cast<float>(0),
static_cast<float>(10));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -35,10 +35,8 @@ TEST(inference, image_classification) {
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [0.0, 1.0].
SetupTensor<float>(input,
{FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0),
static_cast<float>(1));
SetupTensor<float>(&input, {FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
......@@ -48,8 +46,8 @@ TEST(inference, image_classification) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
FLAGS_repeat);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
......@@ -59,8 +57,8 @@ TEST(inference, image_classification) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
FLAGS_repeat);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
int64_t predicate_dict_len = 3162;
int64_t mark_dict_len = 2;
SetupLoDTensor(word,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(predicate,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
static_cast<int64_t>(predicate_dict_len - 1));
SetupLoDTensor(ctx_n2,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_n1,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_0,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p1,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(ctx_p2,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
SetupLoDTensor(mark,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
static_cast<int64_t>(mark_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(input,
{FLAGS_batch_size, 1, 28, 28},
static_cast<float>(-1),
static_cast<float>(1));
SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
static_cast<float>(-1), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
......@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
// Run inference on CPU
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
......@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
// Run inference on CUDA GPU
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
// Use the first data from paddle.dataset.movielens.test() as input
std::vector<int64_t> user_id_data = {1};
SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
std::vector<int64_t> gender_id_data = {1};
SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
std::vector<int64_t> age_id_data = {0};
SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
std::vector<int64_t> job_id_data = {10};
SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
std::vector<int64_t> movie_id_data = {783};
SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
std::vector<int64_t> category_id_data = {10, 8, 9};
SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&user_id);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
paddle::framework::LoDTensor word_data, trg_word;
paddle::framework::LoD lod{{0, 4, 10}};
SetupLoDTensor(
word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(
trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
static_cast<int64_t>(1));
SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
static_cast<int64_t>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&word_data);
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
paddle::framework::LoD lod{{0, 4, 10}};
int64_t word_dict_len = 5147;
SetupLoDTensor(words,
lod,
static_cast<int64_t>(0),
SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
static_cast<int64_t>(word_dict_len - 1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......
......@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
......@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
paddle::framework::LoD lod{{0, 1}};
int64_t dict_size = 2073; // The size of dictionary
SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word);
......
......@@ -11,59 +11,59 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <random>
#include <string>
#include <vector>
#include <time.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
T lower,
T upper) {
srand(time(0));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input.numel(); ++i) {
input_ptr[i] =
(static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
lower;
void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, T lower, T upper) {
std::mt19937 rng(100); // An arbitrarily chosen but fixed seed.
std::uniform_real_distribution<double> uniform_dist(0, 1);
T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
for (int i = 0; i < input->numel(); ++i) {
input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
paddle::framework::DDim dims,
std::vector<T>& data) {
void SetupTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims, const std::vector<T>& data) {
CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
}
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input,
paddle::framework::LoD& lod,
T lower,
T upper) {
input.set_lod(lod);
void SetupLoDTensor(paddle::framework::LoDTensor* input,
const paddle::framework::LoD& lod, T lower, T upper) {
input->set_lod(lod);
int dim = lod[0][lod[0].size() - 1];
SetupTensor<T>(input, {dim, 1}, lower, upper);
}
template <typename T>
void SetupLoDTensor(paddle::framework::LoDTensor& input,
void SetupLoDTensor(paddle::framework::LoDTensor* input,
paddle::framework::DDim dims,
paddle::framework::LoD lod,
std::vector<T>& data) {
const paddle::framework::LoD lod,
const std::vector<T>& data) {
const size_t level = lod.size() - 1;
CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
input.set_lod(lod);
input->set_lod(lod);
SetupTensor<T>(input, dims, data);
}
template <typename T>
void CheckError(paddle::framework::LoDTensor& output1,
paddle::framework::LoDTensor& output2) {
void CheckError(const paddle::framework::LoDTensor& output1,
const paddle::framework::LoDTensor& output2) {
// Check lod information
EXPECT_EQ(output1.lod(), output2.lod());
......@@ -91,9 +91,8 @@ void CheckError(paddle::framework::LoDTensor& output1,
template <typename Place>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1,
const bool is_combined = false) {
const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1, const bool is_combined = false) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
......@@ -132,10 +131,8 @@ void TestInference(const std::string& dirname,
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program =
paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
inference_program = paddle::inference::Load(
executor, *scope, dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
......@@ -14,3 +14,7 @@ cc_library(paddle_memory
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place paddle_memory)
#endif()
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
......@@ -134,21 +135,31 @@ bool GPUAllocator::UseGpu() const { return true; }
// memory. It’s locked to a physical address.
void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
if (size <= 0) return nullptr;
void* p;
// NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
// NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable =
paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
if (size > usable) return nullptr;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
<< " MB pinned memory."
<< ", available " << usable / 1024.0 / 1024.0 << " MB";
return nullptr;
}
void* p;
// PINNED memory is visible to all CUDA contexts.
cudaError_t result = cudaMallocHost(&p, size);
if (result == cudaSuccess) {
index = 1;
fallback_alloc_size_ += size;
index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
return p;
} else {
LOG(WARNING) << "cudaMallocHost failed.";
return nullptr;
}
return nullptr;
......@@ -158,8 +169,8 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
PADDLE_ASSERT(index == 1);
PADDLE_ASSERT(fallback_alloc_size_ >= size);
fallback_alloc_size_ -= size;
PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
cuda_pinnd_alloc_size_ -= size;
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
......@@ -172,7 +183,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
}
}
bool CUDAPinnedAllocator::UseGpu() const { return true; }
bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
......
......@@ -21,8 +21,9 @@ namespace memory {
namespace detail {
/**
* \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
* A BuddyAllocator object uses a SystemAllocator* pointing to the
* \brief SystemAllocator is the parent class of CPUAllocator,
* CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
* object uses a SystemAllocator* pointing to the
* underlying system allocator.
*/
class SystemAllocator {
......@@ -62,9 +63,7 @@ class CUDAPinnedAllocator : public SystemAllocator {
virtual bool UseGpu() const;
private:
size_t gpu_alloc_size_ =
0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
size_t fallback_alloc_size_ = 0;
size_t cuda_pinnd_alloc_size_ = 0;
};
#endif
......
......@@ -56,6 +56,45 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
}
}
template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
platform::SetDeviceId(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
#endif
} // namespace memory
......
......@@ -38,8 +38,7 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}
template <>
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
bool is_pinned) {
void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void* p = GetCPUBuddyAllocator()->Alloc(size);
VLOG(10) << " pointer=" << p;
......@@ -47,8 +46,7 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
}
template <>
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
bool is_pinned) {
void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
......@@ -84,52 +82,20 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
return as[gpu_id];
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
int gpu_num = platform::GetCUDADeviceCount();
as = new BuddyAllocator*[gpu_num];
for (int gpu = 0; gpu < gpu_num; gpu++) {
as[gpu] = nullptr;
}
}
platform::SetDeviceId(gpu_id);
if (!as[gpu_id]) {
as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
return as[gpu_id];
}
template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
return GetGPUBuddyAllocator(place.device)->Used();
}
template <>
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
bool is_pinned) {
void* ptr;
if (is_pinned) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
} else {
void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
ptr = buddy_allocator->Alloc(size);
}
auto* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
platform::GpuMemoryUsage(avail, total);
platform::GpuMemoryUsage(&avail, &total);
LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
<< place.device << ", available " << avail << " bytes";
LOG(WARNING) << "total " << total;
......@@ -142,15 +108,42 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
}
template <>
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
bool is_pinned) {
if (is_pinned) {
GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
} else {
void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
GetGPUBuddyAllocator(place.device)->Free(p);
}
BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
static BuddyAllocator* ba = NULL;
if (ba == NULL) {
ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
}
return ba;
}
template <>
size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
return GetCUDAPinnedBuddyAllocator()->Used();
}
template <>
void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
size_t size) {
auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
void* ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
<< " bytes in CUDAPinnedPlace";
}
return ptr;
}
template <>
void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
GetCUDAPinnedBuddyAllocator()->Free(p);
}
#endif
size_t Usage::operator()(const platform::CPUPlace& cpu) const {
......@@ -165,6 +158,14 @@ size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
#endif
}
size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return Used(cuda_pinned);
#else
PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
#endif
}
size_t memory_usage(const platform::Place& p) {
return boost::apply_visitor(Usage(), p);
}
......
......@@ -33,7 +33,7 @@ namespace memory {
* address is valid or not.
*/
template <typename Place>
void* Alloc(Place place, size_t size, bool is_pinned = false);
void* Alloc(Place place, size_t size);
/**
* \brief Free memory block in one place.
......@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool is_pinned = false);
*
*/
template <typename Place>
void Free(Place place, void* ptr, bool is_pinned = false);
void Free(Place place, void* ptr);
/**
* \brief Total size of used memory in one place.
......@@ -57,6 +57,7 @@ size_t Used(Place place);
struct Usage : public boost::static_visitor<size_t> {
size_t operator()(const platform::CPUPlace& cpu) const;
size_t operator()(const platform::CUDAPlace& gpu) const;
size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
};
size_t memory_usage(const platform::Place& p);
......@@ -74,13 +75,11 @@ class PODDeleter {
static_assert(std::is_pod<T>::value, "T must be POD");
public:
explicit PODDeleter(Place place, bool is_pinned = false)
: place_(place), is_pinned_(is_pinned) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }
explicit PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
bool is_pinned_;
};
/**
......
......@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#include <unordered_map>
#include "gtest/gtest.h"
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include <unordered_map>
inline bool is_aligned(void const *p) {
return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
}
......@@ -141,4 +141,59 @@ TEST(BuddyAllocator, GPUMultAlloc) {
}
}
size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
size += sizeof(paddle::memory::detail::Metadata);
size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
size_t remaining = size % alignment;
return remaining == 0 ? size : size + (alignment - remaining);
}
TEST(BuddyAllocator, CUDAPinnedAllocator) {
void *p = nullptr;
EXPECT_EQ(p, nullptr);
paddle::platform::CUDAPinnedPlace cpu;
p = paddle::memory::Alloc(cpu, 4096);
EXPECT_NE(p, nullptr);
paddle::platform::Place place = cpu;
EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
paddle::memory::Free(cpu, p);
}
TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
paddle::platform::CUDAPinnedPlace cpu;
std::unordered_map<void *, size_t> ps;
size_t total_size = paddle::memory::Used(cpu);
EXPECT_EQ(total_size, 0UL);
for (auto size :
{0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
ps[paddle::memory::Alloc(cpu, size)] = size;
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(size, cpu);
total_size += aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
for (auto p : ps) {
EXPECT_EQ(is_aligned(p.first), true);
paddle::memory::Free(cpu, p.first);
// Buddy Allocator doesn't manage too large memory chunk
if (paddle::memory::Used(cpu) == total_size) continue;
size_t aligned_size = align(p.second, cpu);
total_size -= aligned_size;
EXPECT_EQ(total_size, paddle::memory::Used(cpu));
}
}
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <unordered_map>
#include "paddle/fluid/memory/detail/memory_block.h"
#include "paddle/fluid/memory/detail/meta_data.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
// This unit test is an example comparing the performance between using pinned
// memory and not. In general, using pinned memory will be faster.
template <typename T>
__global__ void Kernel(T* output, int dim) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < dim) {
output[tid] = output[tid] * output[tid] / 100;
}
}
template <typename Place>
float test_pinned_memory() {
Place cpu_place;
paddle::platform::CUDAPlace cuda_place;
const int data_size = 4096;
const int iteration = 10;
// create event start and end
cudaEvent_t start_e, stop_e, copying_e;
float elapsedTime = 0;
cudaEventCreate(&start_e);
cudaEventCreate(&stop_e);
cudaEventCreate(&copying_e);
// create computation stream, data copying stream
cudaStream_t computation_stream, copying_stream;
cudaStreamCreate(&computation_stream);
cudaStreamCreate(&copying_stream);
// create record event, pinned memory, gpu memory
std::vector<cudaEvent_t> record_event(iteration);
std::vector<float*> input_pinned_mem(iteration);
std::vector<float*> gpu_mem(iteration);
std::vector<float*> output_pinned_mem(iteration);
// initial data
for (int j = 0; j < iteration; ++j) {
cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming);
cudaEventCreate(&(record_event[j]));
input_pinned_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
output_pinned_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
gpu_mem[j] = static_cast<float*>(
paddle::memory::Alloc(cuda_place, data_size * sizeof(float)));
for (int k = 0; k < data_size; ++k) {
input_pinned_mem[j][k] = k;
}
}
cudaEventRecord(start_e, computation_stream);
// computation
for (int m = 0; m < 30; ++m) {
for (int i = 0; i < iteration; ++i) {
// cpu -> GPU on computation stream.
// note: this operation is async for pinned memory.
paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place,
input_pinned_mem[i], data_size * sizeof(float),
computation_stream);
// call kernel on computation stream.
Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size);
// record event_computation on computation stream
cudaEventRecord(record_event[i], computation_stream);
// wait event_computation on copy stream.
// note: this operation is async.
cudaStreamWaitEvent(copying_stream, record_event[i], 0);
// copy data GPU->CPU, on copy stream.
// note: this operation is async for pinned memory.
paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place,
gpu_mem[i], data_size * sizeof(float),
copying_stream);
}
}
cudaEventRecord(copying_e, copying_stream);
cudaStreamWaitEvent(computation_stream, copying_e, 0);
cudaEventRecord(stop_e, computation_stream);
cudaEventSynchronize(start_e);
cudaEventSynchronize(stop_e);
cudaEventElapsedTime(&elapsedTime, start_e, stop_e);
// std::cout << cpu_place << " "
// << "time consume:" << elapsedTime / 30 << std::endl;
for (int l = 0; l < iteration; ++l) {
for (int k = 0; k < data_size; ++k) {
float temp = input_pinned_mem[l][k];
temp = temp * temp / 100;
EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]);
}
}
// destroy resource
cudaEventDestroy(copying_e);
cudaEventDestroy(start_e);
cudaEventDestroy(stop_e);
for (int j = 0; j < 10; ++j) {
cudaEventDestroy((record_event[j]));
paddle::memory::Free(cpu_place, input_pinned_mem[j]);
paddle::memory::Free(cpu_place, output_pinned_mem[j]);
paddle::memory::Free(cuda_place, gpu_mem[j]);
}
return elapsedTime / 30;
}
TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) {
// Generally speaking, operation on pinned_memory is faster than that on
// unpinned-memory, but if this unit test fails frequently, please close this
// test for the time being.
float time1 = test_pinned_memory<paddle::platform::CPUPlace>();
float time2 = test_pinned_memory<paddle::platform::CUDAPinnedPlace>();
EXPECT_GT(time1, time2);
}
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
......@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
function(op_library TARGET)
# op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library
......@@ -193,6 +193,7 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
else()
set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
......
......@@ -128,10 +128,32 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor core is supported since the volta GPU and
// is only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
// get workspace size able to allocate
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, algo, &workspace_size_in_bytes));
// It is possible for float16 on Volta GPU to allocate more memory than
// the limit because the algo is overrided to use tensor core.
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit");
// Allocate on GPU memory
platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
......
......@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
......
......@@ -17,7 +17,7 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
namespace paddle {
namespace operators {
......
......@@ -19,9 +19,11 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <vector>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "grpc++/grpc++.h"
namespace grpc {
// A ZeroCopyInputStream that reads from grpc_byte_buffer
......@@ -56,7 +58,7 @@ class GrpcBufferReader final
*data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
backup_count_;
GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
*size = (int)backup_count_;
*size = static_cast<int>(backup_count_);
backup_count_ = 0;
return true;
}
......@@ -68,7 +70,7 @@ class GrpcBufferReader final
*data = GRPC_SLICE_START_PTR(slice_);
// On win x64, int is only 32bit
GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
byte_count_ += * size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
return true;
}
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/detail/grpc_client.h"
#include <sys/time.h>
#include <limits>
#include "paddle/fluid/framework/threadpool.h"
......@@ -54,7 +56,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
});
req_count_++;
......@@ -66,7 +68,7 @@ void ProcGetResponse(const VarHandle& var_h,
// const sendrecv::VariableMessage& ret_msg) {
const ::grpc::ByteBuffer& ret_msg) {
framework::Variable* outvar = NULL;
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar);
DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
}
template <typename T>
......@@ -110,7 +112,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall();
call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
});
req_count_++;
......@@ -170,7 +172,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
......@@ -182,7 +184,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
req_count_++;
}
......
......@@ -14,10 +14,9 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <grpc/support/log.h>
#include <time.h>
#include <chrono>
#include <chrono> // NOLINT
#include <ctime>
#include <functional>
#include <iostream>
......@@ -25,11 +24,11 @@ limitations under the License. */
#include <string>
#include <vector>
#include <grpc++/generic/generic_stub.h>
#include <grpc++/grpc++.h>
#include <grpc++/support/byte_buffer.h>
#include <grpc++/support/slice.h>
#include "grpc++/generic/generic_stub.h"
#include "grpc++/grpc++.h"
#include "grpc++/support/byte_buffer.h"
#include "grpc++/support/slice.h"
#include "grpc/support/log.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......
......@@ -186,7 +186,8 @@ void AsyncGRPCServer::WaitClientGet(int count) {
void AsyncGRPCServer::RunSyncUpdate() {
::grpc::ServerBuilder builder;
builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials());
builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials(),
&selected_port_);
builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
builder.RegisterService(&service_);
......@@ -196,7 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() {
cq_prefetch_ = builder.AddCompletionQueue();
server_ = builder.BuildAndStart();
LOG(INFO) << "Server listening on " << address_ << std::endl;
LOG(INFO) << "Server listening on " << address_
<< " selected port: " << selected_port_;
std::function<void()> send_register =
std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
......@@ -273,7 +275,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() {
// FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq,
std::string cq_name,
const std::string& cq_name,
std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne();
......
......@@ -14,10 +14,11 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include "grpc++/grpc++.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -62,6 +63,8 @@ class AsyncGRPCServer final {
void SetExecutor(framework::Executor *executor) { executor_ = executor; }
int GetSelectedPort() { return selected_port_; }
const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); }
void Push(const std::string &msg_name) {
......@@ -71,7 +74,8 @@ class AsyncGRPCServer final {
void ShutDown();
protected:
void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name,
void HandleRequest(::grpc::ServerCompletionQueue *cq,
const std::string &cq_name,
std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne();
......@@ -109,6 +113,7 @@ class AsyncGRPCServer final {
int prefetch_blk_id_;
framework::ProgramDesc *program_;
framework::Executor *executor_;
int selected_port_;
};
}; // namespace detail
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/operators/detail/grpc_client.h"
......
......@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <grpc++/grpc++.h>
#include <string>
#include "grpc++/grpc++.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -142,6 +144,6 @@ class ProtoEncodeHelper {
char* limit_; // Just for CHECKs
};
} // detail
} // operators
} // paddle
} // namespace detail
} // namespace operators
} // namespace paddle
......@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include <sys/time.h>
#include <thread>
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
......@@ -42,7 +44,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void* buf = malloc(1024);
void* payload = nullptr;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
......@@ -152,7 +154,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().size() * framework::SizeOfType(typeid(int64_t));
......@@ -181,10 +183,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable*& var) {
framework::Variable** var) {
operators::detail::VariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
var = resp.GetVar();
*var = resp.GetVar();
}
} // namespace detail
......
......@@ -51,7 +51,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable*& var);
framework::Variable** var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include <thread> // NOLINT
#include <google/protobuf/text_format.h>
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
......@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
}
for (int i = 0; i < rows2->size(); ++i) {
for (int64_t i = 0; i < rows2->size(); ++i) {
EXPECT_EQ(rows_data2[i], i);
}
EXPECT_EQ(slr2->height(), 1000);
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include <condition_variable>
#include <condition_variable> // NOLINT
#include <deque>
#include <mutex>
#include <mutex> // NOLINT
namespace paddle {
namespace operators {
......
......@@ -13,7 +13,11 @@
// limitations under the License.
#include "paddle/fluid/operators/detail/variable_response.h"
#include <string.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
......@@ -108,7 +112,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
bool VariableResponse::CopyLodTensorData(
::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
const platform::DeviceContext& ctx, const framework::DDim& dims,
int length) {
auto var = scope_->FindVar(meta_.varname());
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
......@@ -144,14 +149,15 @@ inline framework::DDim GetDims(
bool VariableResponse::CopySelectRowsTensorData(
::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, framework::DDim& dims, int length) {
const platform::DeviceContext& ctx, const framework::DDim& dims,
int length) {
auto var = scope_->FindVar(meta_.varname());
auto* slr = var->GetMutable<framework::SelectedRows>();
slr->set_height(meta_.slr_height());
auto* tensor = slr->mutable_value();
tensor->Resize(dims);
PADDLE_ENFORCE_EQ(
tensor->numel(),
static_cast<size_t>(tensor->numel()),
length / framework::SizeOfType(
paddle::operators::detail::ToTypeIndex(meta_.data_type())));
void* tensor_data = tensor->mutable_data(
......
......@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......@@ -60,14 +62,14 @@ class VariableResponse {
private:
bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx,
framework::DDim& dims, int length);
const framework::DDim& dims, int length);
bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, int length);
bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx,
framework::DDim& dims, int length);
const framework::DDim& dims, int length);
private:
const framework::Scope* scope_;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/fc_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
template <typename T>
class MKLDNNMD {
public:
explicit MKLDNNMD(const T* in, const T* w, bool bias)
: in(paddle::framework::vectorize2int(in->dims())),
w(paddle::framework::vectorize2int(w->dims())) {
with_bias_ = bias;
}
mkldnn::memory::desc dst() const {
return platform::MKLDNNMemDesc({in[0], w[1]},
mkldnn::memory::data_type::f32,
mkldnn::memory::format::nc);
}
mkldnn::memory::desc src() const {
return is_spatial()
? platform::MKLDNNMemDesc({in[0], in[1], in[2], in[3]},
mkldnn::memory::data_type::f32,
mkldnn::memory::format::nchw)
: platform::MKLDNNMemDesc({in[0], in[1]},
mkldnn::memory::data_type::f32,
mkldnn::memory::format::nc);
}
mkldnn::memory::desc weights() const {
return is_spatial()
? platform::MKLDNNMemDesc({w[1], in[1], in[2], in[3]},
mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw)
: platform::MKLDNNMemDesc({w[1], in[1]},
mkldnn::memory::data_type::f32,
mkldnn::memory::format::oi);
}
mkldnn::memory::desc bias() const {
return with_bias_
? platform::MKLDNNMemDesc({w[1]}, mkldnn::memory::data_type::f32,
mkldnn::memory::format::format_undef)
: platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32,
mkldnn::memory::format::format_undef);
}
private:
bool is_spatial() const { return in.size() > 1 && w.size() > 1; }
std::vector<int> in;
std::vector<int> w;
bool with_bias_;
bool is_spatial_;
};
class MKLDNNMemory {
public:
MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e)
: md_(t), engine_(e) {}
virtual ~MKLDNNMemory() = default;
template <typename Output>
mkldnn::memory dst(const Output* out) {
return mkldnn::memory({md_->dst(), engine_},
static_cast<void*>(const_cast<float*>(out)));
}
template <typename Output>
mkldnn::memory dst(Output* out) {
return mkldnn::memory({md_->dst(), engine_}, out);
}
template <typename Input>
mkldnn::memory src(const Input* in) {
return mkldnn::memory({md_->src(), engine_},
static_cast<void*>(const_cast<float*>(in)));
}
template <typename Weight>
mkldnn::memory weights(const Weight* w) {
return mkldnn::memory({md_->weights(), engine_},
static_cast<void*>(const_cast<float*>(w)));
}
mkldnn::memory bias() {
return mkldnn::memory(mkldnn::memory::primitive_desc(md_->bias(), engine_));
}
private:
MKLDNNMD<Tensor>* md_;
const mkldnn::engine& engine_;
};
template <typename T>
class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto input = ctx.Input<Tensor>("Input");
auto w = ctx.Input<Tensor>("W");
PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4,
"Input must be with 2 or 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4,
"Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
bool with_bias = ctx.Attr<bool>("bias_attr");
MKLDNNMD<Tensor> md(input, w, with_bias);
std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> pd =
FcFwdPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(),
with_bias, mkldnn_engine);
const std::string key = ctx.op().Output("Out");
const std::string key_fc_pd = key + "@fc_pd";
dev_ctx.SetBlob(key_fc_pd, pd);
MKLDNNMemory mem(&md, mkldnn_engine);
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
auto output = ctx.Output<Tensor>("Out");
T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto dst_memory = mem.dst(output_data);
auto src_memory = mem.src(input_data);
auto weights_memory = mem.weights(w_data);
auto bias_memory = mem.bias();
auto forward = with_bias ? mkldnn::inner_product_forward(
*pd, src_memory, weights_memory, bias_memory,
dst_memory)
: mkldnn::inner_product_forward(
*pd, src_memory, weights_memory, dst_memory);
std::vector<mkldnn::primitive> pipeline = {forward};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::inner_product_forward::primitive_desc>
FcFwdPrimitiveDesc(const mkldnn::memory::desc& src,
const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst,
const mkldnn::memory::desc& bias, const bool with_bias,
const mkldnn::engine& engine) const {
auto desc = with_bias
? mkldnn::inner_product_forward::desc(
mkldnn::prop_kind::forward, src, weights, bias, dst)
: mkldnn::inner_product_forward::desc(
mkldnn::prop_kind::forward, src, weights, dst);
auto pd = new mkldnn::inner_product_forward::primitive_desc(desc, engine);
return std::unique_ptr<mkldnn::inner_product_forward::primitive_desc>(pd);
}
};
template <typename T>
class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
T* input_grad_data = nullptr;
T* w_grad_data = nullptr;
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (w_grad) {
w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
}
const Tensor* input = ctx.Input<Tensor>("Input");
const T* input_data = input->data<T>();
const Tensor* w = ctx.Input<Tensor>("W");
const T* w_data = w->data<T>();
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
const T* out_grad_data = out_grad->data<T>();
bool with_bias = ctx.Attr<bool>("bias_attr");
MKLDNNMD<Tensor> md(input, w, with_bias);
MKLDNNMemory mem(&md, mkldnn_engine);
auto dst_memory = mem.dst(out_grad_data);
auto src_memory = mem.src(input_data);
auto weights_memory = mem.weights(w_data);
auto bias_memory = mem.bias();
const std::string key = ctx.op().Input("Out");
const std::string key_fc_pd = key + "@fc_pd";
auto pd =
std::static_pointer_cast<mkldnn::inner_product_forward::primitive_desc>(
dev_ctx.GetBlob(key_fc_pd));
PADDLE_ENFORCE(pd != nullptr, "Fail to find key_fc_pd in device context");
if (w_grad) {
auto weights_grad_memory = mem.weights(w_grad_data);
mkldnn::inner_product_backward_weights::primitive_desc bwd_weight_pd =
FcBwdWeightsPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(),
with_bias, *pd, mkldnn_engine);
auto bwd_weights_prim = mkldnn::inner_product_backward_weights(
bwd_weight_pd, src_memory, dst_memory, weights_grad_memory,
bias_memory);
std::vector<mkldnn::primitive> pipeline{bwd_weights_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
auto src_grad_memory = mem.src(input_grad_data);
mkldnn::inner_product_backward_data::primitive_desc bwd_data_pd =
FcBwdDataPrimitiveDesc(md.src(), md.weights(), md.dst(), *pd,
mkldnn_engine);
auto bwd_data_prim = mkldnn::inner_product_backward_data(
bwd_data_pd, dst_memory, weights_memory, src_grad_memory);
std::vector<mkldnn::primitive> pipeline{bwd_data_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
}
private:
mkldnn::inner_product_backward_weights::primitive_desc
FcBwdWeightsPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
const mkldnn::memory::desc& diff_dst, const mkldnn::memory::desc& bias,
const bool with_bias,
const mkldnn::inner_product_forward::primitive_desc& pd,
const mkldnn::engine& engine) const {
auto bwd_weight_desc = with_bias
? mkldnn::inner_product_backward_weights::desc(
src, diff_weights, bias, diff_dst)
: mkldnn::inner_product_backward_weights::desc(
src, diff_weights, bias, diff_dst);
return mkldnn::inner_product_backward_weights::primitive_desc(
bwd_weight_desc, engine, pd);
}
mkldnn::inner_product_backward_data::primitive_desc FcBwdDataPrimitiveDesc(
const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& diff_dst,
const mkldnn::inner_product_forward::primitive_desc& pd,
const mkldnn::engine& engine) const {
auto bwd_data_desc =
mkldnn::inner_product_backward_data::desc(diff_src, weights, diff_dst);
return mkldnn::inner_product_backward_data::primitive_desc(bwd_data_desc,
engine, pd);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::FCMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(fc_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::FCMKLDNNGradOpKernel<float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fc_op.h"
#include <vector>
namespace paddle {
namespace operators {
void FCOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"X(Input) of Fully Connected should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Out(Output) of Fully Connected should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"),
"W(Input) of Fully Connected should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto w_dims = ctx->GetInputDim("W");
std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("Input", "Out");
}
framework::OpKernelType FCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout, library);
}
void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto w_dims = ctx->GetInputDim("W");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("W"))) {
ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
}
}
framework::OpKernelType FCOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library{framework::LibraryType::kMKLDNN};
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout, library);
}
FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
AddInput("W", "(Tensor), The second input tensor of fc op.");
AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("bias_attr", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC(
Fully Connected Operator.
The fully connected operation calculates the output based on the input, weights and bias attribute.
The size of each dimension of the parameters checked in the infer-shape.
The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
Additional parametrs are use_mkldnn and bias_attr.
The input(X) size and output(Out) size may be diffrent.
The fully connected layer only supports MKLDNN version
)DOC");
}
} // namespace operators
} // namespace paddle
REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad,
paddle::operators::FCOpGrad);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class FCOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FCOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
FCOpMaker(OpProto* proto, OpAttrChecker* op_checker);
};
} // namespace operators
} // namespace paddle
......@@ -12,20 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <ostream>
#include <thread>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
service->RunSyncUpdate();
VLOG(4) << "RunServer thread end";
......@@ -45,16 +39,19 @@ static void CreateTensorFromMessageType(framework::Variable *var,
}
}
static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
framework::Executor *executor,
framework::ProgramDesc *program,
framework::Scope *scope) {
static void ParallelExecuteBlocks(
const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
&prepared,
framework::ProgramDesc *program, framework::Scope *scope) {
std::vector<std::future<void>> fs;
for (size_t idx : parallel_blkids) {
fs.push_back(framework::Async([&executor, &program, &scope, idx]() {
fs.push_back(
framework::Async([&executor, &prepared, &program, &scope, idx]() {
int run_block = idx; // thread local
try {
executor->Run(*program, scope, run_block, false, false);
executor->RunPreparedContext(prepared[run_block].get(), scope,
false, false);
} catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what();
}
......@@ -63,49 +60,60 @@ static void ParallelExecuteBlocks(const std::vector<size_t> &parallel_blkids,
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
}
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
ListenAndServOp::ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
server_thread_.reset(new std::thread(RunServer, rpc_service_));
}
}
: OperatorBase(type, inputs, outputs, attrs) {}
void Stop() override {
int ListenAndServOp::GetSelectedPort() {
return rpc_service_->GetSelectedPort();
}
void ListenAndServOp::Stop() {
rpc_service_->Push(LISTEN_TERMINATE_MESSAGE);
server_thread_->join();
}
}
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
void ListenAndServOp::RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
framework::Scope &recv_scope = scope.NewScope();
// FIXME(Yancey1989): initialize rpc server with lazy mode.
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
if (!rpc_service_) {
std::string endpoint = Attr<std::string>("endpoint");
rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
}
auto ins = Inputs("X");
auto fan_in = Attr<int>("Fanin");
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program();
int num_blocks = program->Size();
size_t num_blocks = program->Size();
PADDLE_ENFORCE_GE(num_blocks, 2,
"server program should have at least 2 blocks");
framework::Executor executor(dev_place);
std::vector<int> block_list;
for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
block_list.push_back(blkid);
}
auto prepared = executor.Prepare(*program, block_list);
// Insert placeholder for block0 which holds current op itself.
prepared.insert(prepared.begin(),
std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
rpc_service_->SetScope(&recv_scope);
rpc_service_->SetDevCtx(&dev_ctx);
// TODO(qiao) set proper fields for table lookup and update
rpc_service_->SetExecutor(&executor);
rpc_service_->SetPrefetchBlkdId(0);
rpc_service_->SetProgram(program);
// start the server listening after all member initialized.
server_thread_.reset(new std::thread(RunServer, rpc_service_));
// FIXME(typhoonzero): do we need to wait until the server port is ready?
sleep(5);
// TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false;
......@@ -153,24 +161,22 @@ class ListenAndServOp : public framework::OperatorBase {
// The optimize blocks which have the same parent ID would run parallel
// TODO(Yancey1989): need to use ParallelExecutor for future
size_t last_parent_blkid = program->Block(1).Parent();
int32_t last_parent_blkid = program->Block(1).Parent();
std::vector<size_t> parallel_blkids;
parallel_blkids.push_back(1);
double ts = detail::GetTimestamp();
for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
if (program->Block(blkid).Parent() != last_parent_blkid) {
for (size_t idx : parallel_blkids) VLOG(3) << idx;
ParallelExecuteBlocks(parallel_blkids, &executor, program,
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
parallel_blkids.clear();
last_parent_blkid = program->Block(blkid).Parent();
}
parallel_blkids.push_back(blkid);
}
ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope);
VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts
<< "(ms)";
ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
&recv_scope);
VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
// Reset the received sparse variables, the sum operator would not
// sum the input sparse variables which rows is empty at the next
......@@ -185,12 +191,7 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear();
} // while(true)
}
protected:
std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
std::shared_ptr<std::thread> server_thread_;
};
}
class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
public:
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <ostream>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/detail/grpc_server.h"
namespace paddle {
namespace operators {
constexpr char kOptimizeBlock[] = "OptimizeBlock";
void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
class ListenAndServOp : public framework::OperatorBase {
public:
ListenAndServOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs);
int GetSelectedPort();
void Stop() override;
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override;
protected:
mutable std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
mutable std::shared_ptr<std::thread> server_thread_;
};
} // namespace operators
} // namespace paddle
......@@ -18,22 +18,6 @@ limitations under the License. */
namespace paddle {
namespace operators {
static inline framework::OpKernelType ExpectedKernelType(
const framework::ExecutionContext& ctx) {
auto* table_var = ctx.InputVar("W");
if (table_var->IsType<LoDTensor>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<LoDTensor>().type()),
ctx.device_context());
} else if (table_var->IsType<SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(table_var->Get<SelectedRows>().value().type()),
ctx.device_context());
} else {
PADDLE_THROW("W should be LoDTensor or SelectedRows");
}
}
class LookupTableOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -67,7 +51,8 @@ class LookupTableOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx);
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
......@@ -138,7 +123,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return ExpectedKernelType(ctx);
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
......
......@@ -30,13 +30,7 @@ using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
static constexpr int64_t kNoPadding = -1;
inline size_t getIndex(const std::vector<int64_t> &rows, int64_t value) {
auto it = std::find(rows.begin(), rows.end(), value);
PADDLE_ENFORCE(it != rows.end(), "id should be in rows");
return static_cast<size_t>(std::distance(rows.begin(), it));
}
constexpr int64_t kNoPadding = -1;
template <typename T>
class LookupTableKernel : public framework::OpKernel<T> {
......@@ -55,7 +49,9 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW("table only support LoDTensor and SelectedRows");
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
int64_t *ids;
......@@ -107,7 +103,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
memset(output + i * row_width, 0, row_width * sizeof(T));
} else {
PADDLE_ENFORCE_GE(ids[i], 0);
auto id_index = getIndex(table_t.rows(), ids[i]);
auto id_index = table_t.index(ids[i]);
memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T));
}
......@@ -128,7 +124,9 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
auto *table_t = context.Input<SelectedRows>("W");
table_dim = table_t->value().dims();
} else {
PADDLE_THROW("table only support LoDTensor and SelectedRows");
PADDLE_THROW(
"The parameter W of a LookupTable "
"must be either LoDTensor or SelectedRows");
}
bool is_sparse = context.Attr<bool>("is_sparse");
......
......@@ -322,6 +322,14 @@ void set_constant_with_place<platform::CPUPlace>(
TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
framework::VisitDataType(framework::ToDataType(tensor->type()),
TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor, float value)
......
......@@ -39,18 +39,33 @@ void gemm<platform::CUDADeviceContext, float16>(
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
"cublas fp16 gemm requires GPU compute capability >= 53");
cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
if (context.GetComputeCapability() >= 70) {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_TENSOR_OP_MATH));
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
} else {
PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
CUBLAS_DEFAULT_MATH));
}
#endif
// cublasHgemm does true FP16 computation which is slow for non-Volta
// GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
// input/output in fp16, computation in fp32, which can also be accelerated
// using tensor cores in volta GPUs.
PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
CUDA_R_32F, algo));
}
template <>
......
......@@ -14,6 +14,8 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
......@@ -95,6 +97,7 @@ template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
......
......@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct ValueClip {
HOSTDEVICE T operator()(const T& x) const {
const T kThreshold = -64.;
const T kThreshold = static_cast<T>(-64.);
return x < kThreshold ? kThreshold : x;
}
};
......
......@@ -15,8 +15,8 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <memory>
#include <mutex>
#include <thread>
#include <mutex> // NOLINT
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/init.h"
......@@ -43,7 +43,7 @@ const f::DDim kDims = {20, 20};
// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
......@@ -64,7 +64,7 @@ class NCCLTester : public ::testing::Test {
NCCLInitOp();
}
virtual void TearDown() override {
void TearDown() override {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
......
......@@ -73,7 +73,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
platform::CPUPlace());
ctx.device_context());
}
};
......@@ -171,6 +171,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float>,
ops::PriorBoxOpKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prior_box_op.h"
namespace paddle {
namespace operators {
template <typename T>
__device__ inline T clip(T in) {
return min(max(in, 0.), 1.);
}
template <typename T>
__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
const int width, const int im_height,
const int im_width, const int as_num,
const T offset, const T step_width,
const T step_height, const T* min_sizes,
const T* max_sizes, const int min_num,
bool is_clip) {
int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
int box_num = height * width * num_priors;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
i += blockDim.x * gridDim.x) {
int h = i / (num_priors * width);
int w = (i / num_priors) % width;
int p = i % num_priors;
int m = max_sizes ? p / (as_num + 1) : p / as_num;
T cx = (w + offset) * step_width;
T cy = (h + offset) * step_height;
T bw, bh;
T min_size = min_sizes[m];
if (max_sizes) {
int s = p % (as_num + 1);
if (s < as_num) {
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
} else {
T max_size = max_sizes[m];
bw = sqrt(min_size * max_size) / 2.;
bh = bw;
}
} else {
int s = p % as_num;
T ar = aspect_ratios[s];
bw = min_size * sqrt(ar) / 2.;
bh = min_size / sqrt(ar) / 2.;
}
T xmin = (cx - bw) / im_width;
T ymin = (cy - bh) / im_height;
T xmax = (cx + bw) / im_width;
T ymax = (cy + bh) / im_height;
out[i * 4] = is_clip ? clip<T>(xmin) : xmin;
out[i * 4 + 1] = is_clip ? clip<T>(ymin) : ymin;
out[i * 4 + 2] = is_clip ? clip<T>(xmax) : xmax;
out[i * 4 + 3] = is_clip ? clip<T>(ymax) : ymax;
}
}
template <typename T>
__global__ void SetVariance(T* out, const T* var, const int vnum,
const int num) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
out[i] = var[i % vnum];
}
}
template <typename T>
class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<paddle::framework::Tensor>("Input");
auto* image = ctx.Input<paddle::framework::Tensor>("Image");
auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
auto variances = ctx.Attr<std::vector<float>>("variances");
auto flip = ctx.Attr<bool>("flip");
auto clip = ctx.Attr<bool>("clip");
std::vector<float> aspect_ratios;
ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
T offset = static_cast<T>(ctx.Attr<float>("offset"));
auto im_width = image->dims()[3];
auto im_height = image->dims()[2];
auto width = input->dims()[3];
auto height = input->dims()[2];
T step_width, step_height;
if (step_w == 0 || step_h == 0) {
step_width = static_cast<T>(im_width) / width;
step_height = static_cast<T>(im_height) / height;
} else {
step_width = step_w;
step_height = step_h;
}
int num_priors = aspect_ratios.size() * min_sizes.size();
if (max_sizes.size() > 0) {
num_priors += max_sizes.size();
}
int min_num = static_cast<int>(min_sizes.size());
int box_num = width * height * num_priors;
int block = 512;
int grid = (box_num + block - 1) / block;
auto stream =
ctx.template device_context<platform::CUDADeviceContext>().stream();
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
framework::Tensor r;
framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r);
framework::Tensor min;
framework::TensorFromVector(min_sizes, ctx.device_context(), &min);
T* max_data = nullptr;
framework::Tensor max;
if (max_sizes.size() > 0) {
framework::TensorFromVector(max_sizes, ctx.device_context(), &max);
max_data = max.data<T>();
}
GenPriorBox<T><<<grid, block, 0, stream>>>(
boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
max_data, min_num, clip);
framework::Tensor v;
framework::TensorFromVector(variances, ctx.device_context(), &v);
grid = (box_num * 4 + block - 1) / block;
SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
variances.size(), box_num * 4);
}
}; // namespace operators
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
ops::PriorBoxOpCUDAKernel<double>);
......@@ -51,7 +51,7 @@ struct ClipFunctor {
}
};
template <typename Place, typename T>
template <typename T>
class PriorBoxOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -106,49 +106,24 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
int idx = 0;
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
// first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size / 2.;
// xmin
// priors with different aspect ratios
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
float ar = aspect_ratios[r];
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// second prior: aspect_ratio = 1,
// size = sqrt(min_size * max_size)
// square prior with size sqrt(minSize * maxSize)
box_width = box_height = sqrt(min_size * max_size) / 2.;
// xmin
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
// rest of priors
for (size_t r = 0; r < aspect_ratios.size(); ++r) {
float ar = aspect_ratios[r];
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
// xmin
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
......
......@@ -21,6 +21,22 @@ namespace reader {
class MultipleReader : public framework::ReaderBase {
public:
class ThreadBufferMap {
public:
std::vector<framework::LoDTensor>& operator[](
const std::thread::id& thread_id) {
std::lock_guard<std::mutex> lock(mutex_);
return buffer_[thread_id];
}
void Clear() { buffer_.clear(); }
private:
std::mutex mutex_;
std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
buffer_;
};
MultipleReader(const std::vector<std::string>& file_names,
const std::vector<framework::DDim>& dims, size_t thread_num)
: file_names_(file_names), dims_(dims) {
......@@ -47,28 +63,27 @@ class MultipleReader : public framework::ReaderBase {
framework::Channel<size_t>* waiting_file_idx_;
framework::Channel<size_t>* available_thread_idx_;
framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
mutable std::vector<framework::LoDTensor> local_buffer_;
mutable ThreadBufferMap thread_buffer_map_;
};
void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (!HasNext()) {
PADDLE_THROW("There is no next data!");
}
if (local_buffer_.empty()) {
buffer_->Receive(&local_buffer_);
}
*out = local_buffer_;
local_buffer_.clear();
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
*out = thread_local_buffer;
thread_local_buffer.clear();
}
bool MultipleReader::HasNext() const {
return local_buffer_.empty() ? buffer_->Receive(&local_buffer_) : true;
auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
return thread_local_buffer.empty() ? buffer_->Receive(&thread_local_buffer)
: true;
}
void MultipleReader::ReInit() {
EndScheduler();
local_buffer_.clear();
thread_buffer_map_.Clear();
StartNewScheduler();
}
......@@ -176,7 +191,7 @@ class OpenFilesOp : public framework::OperatorBase {
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
static_cast<int>(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
const auto& file_names = Attr<std::vector<std::string>>("file_names");
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/reshape_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
......
......@@ -14,6 +14,9 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/listen_and_serv_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/string/printf.h"
......@@ -34,6 +35,7 @@ namespace m = paddle::operators::math;
// global for simplicity.
std::unique_ptr<f::OperatorBase> listen_and_serv_op;
int selected_port;
void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
p::CPUDeviceContext ctx(place);
......@@ -128,14 +130,16 @@ void StartServerNet(bool is_sparse) {
AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
f::AttributeMap attrs;
attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
attrs.insert({"endpoint", std::string("127.0.0.1:0")});
attrs.insert({"Fanin", 1});
attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
attrs.insert({"GradList", std::vector<std::string>({"x1"})});
attrs.insert({"OptimizeBlock", optimize_block});
listen_and_serv_op =
f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
LOG(INFO) << "selected port before run " << selected_port;
listen_and_serv_op->Run(scope, place);
LOG(INFO) << "server exit";
}
TEST(SendRecvOp, CPUDense) {
......@@ -149,12 +153,19 @@ TEST(SendRecvOp, CPUDense) {
scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
selected_port = static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get())
->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})});
auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
LOG(INFO) << "before run " << endpoint;
send_op->Run(scope, place);
LOG(INFO) << "end run";
auto in_var = scope.Var("x1");
auto tensor = in_var->GetMutable<f::LoDTensor>();
......@@ -167,6 +178,7 @@ TEST(SendRecvOp, CPUDense) {
for (int64_t i = 0; i < target->numel(); ++i) {
EXPECT_EQ(expected[i] * 2, actual[i]);
}
LOG(INFO) << "before stop";
listen_and_serv_op->Stop();
server_thread.join();
listen_and_serv_op.reset(nullptr);
......@@ -182,8 +194,13 @@ TEST(SendRecvOp, CPUSparse) {
InitSelectedRowsInScope(scope, place);
scope.Var("RPC_CLIENT_VAR");
f::AttributeMap attrs;
attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
selected_port = static_cast<paddle::operators::ListenAndServOp *>(
listen_and_serv_op.get())
->GetSelectedPort();
LOG(INFO) << "selected port " << selected_port;
std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
attrs.insert({"epmap", std::vector<std::string>({endpoint})});
auto send_op = f::OpRegistry::CreateOp(
"send", {{"X", {"x1"}}},
{{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
......
......@@ -43,9 +43,8 @@ class SGDOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
ctx.GetPlace());
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
......@@ -53,10 +52,12 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Param", "(Tensor or SelectedRows) Input parameter");
AddInput("LearningRate", "(Tensor) Learning rate of SGD");
AddInput("Grad", "(Tensor) Input gradient");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
AddOutput("ParamOut",
"(Tensor or SelectedRows, same with Param) "
"Output parameter, should share the same memory with Param");
AddComment(R"DOC(
SGD operator
......
......@@ -23,21 +23,25 @@ namespace operators {
template <typename T>
class SGDOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* param = ctx.Input<framework::Tensor>("Param");
auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param_var = ctx.InputVar("Param");
const auto *grad_var = ctx.InputVar("Grad");
if (param_var->IsType<framework::LoDTensor>()) {
const auto *param = ctx.Input<framework::Tensor>("Param");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
auto* grad_var = ctx.InputVar("Grad");
// Actually, all tensors are LoDTensor except SelectedRows.
if (grad_var->IsType<framework::LoDTensor>()) {
param_out->mutable_data<T>(ctx.GetPlace());
auto* grad = ctx.Input<framework::Tensor>("Grad");
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto p = framework::EigenVector<T>::Flatten(*param);
auto g = framework::EigenVector<T>::Flatten(*grad);
auto o = framework::EigenVector<T>::Flatten(*param_out);
auto* lr = learning_rate->data<T>();
auto *lr = learning_rate->data<T>();
o = p - lr[0] * g;
} else if (grad_var->IsType<framework::SelectedRows>()) {
......@@ -45,7 +49,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
// This manual optimization brings difficulty to track data dependency.
// It's better to find a more elegant solution.
PADDLE_ENFORCE_EQ(param, param_out);
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
// for distributed training, a sparse var may be empty,
// just skip updating.
......@@ -53,31 +57,64 @@ class SGDOpKernel : public framework::OpKernel<T> {
return;
}
auto in_height = grad->height();
auto grad_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);
auto& in_value = grad->value();
auto& in_rows = grad->rows();
auto &grad_value = grad->value();
auto &grad_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
size_t grad_row_numel = grad_value.numel() / grad_rows.size();
PADDLE_ENFORCE_EQ(grad_row_numel, param_out->numel() / grad_height);
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
auto* lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height,
auto *grad_data = grad_value.data<T>();
auto *out_data = param_out->data<T>();
auto *lr = learning_rate->data<T>();
for (size_t i = 0; i < grad_rows.size(); i++) {
PADDLE_ENFORCE(grad_rows[i] < grad_height,
"Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) {
out_data[in_rows[i] * in_row_numel + j] -=
lr[0] * in_data[i * in_row_numel + j];
for (int64_t j = 0; j < grad_row_numel; j++) {
out_data[grad_rows[i] * grad_row_numel + j] -=
lr[0] * grad_data[i * grad_row_numel + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
}
} else if (param_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
"when param "
"is SelectedRows, gradient should also be SelectedRows");
const auto &param = param_var->Get<framework::SelectedRows>();
auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
const auto &grad = grad_var->Get<framework::SelectedRows>();
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad.rows().size() == 0) {
return;
}
size_t param_row_width = param.value().numel() / param.rows().size();
size_t grad_row_width = grad.value().numel() / grad.rows().size();
PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
"param_row should have the same size with grad_row");
const auto *lr = learning_rate->data<T>();
const auto *grad_data = grad.value().data<T>();
auto *out_data = param_out->mutable_value()->data<T>();
for (size_t i = 0; i < grad.rows().size(); i++) {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height");
int64_t id_index = param.index(grad.rows()[i]);
for (int64_t j = 0; j < grad_row_width; j++) {
out_data[id_index * grad_row_width + j] -=
lr[0] * grad_data[i * grad_row_width + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Parameter");
}
}
};
} // namespace operators
......
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h"
#include <string>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
......@@ -20,6 +23,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -60,8 +64,8 @@ class SoftmaxOp : public framework::OperatorWithKernel {
auto input_data_type =
framework::ToDataType(ctx.Input<Tensor>("X")->type());
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"float16 can only be used on GPU place");
}
std::string data_format = ctx.Attr<std::string>("data_format");
......@@ -70,6 +74,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
library_);
}
};
class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
......@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
softmax_grad,
ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(softmax_grad,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>);
......@@ -30,19 +30,16 @@ class SplitIdsOpKernel : public framework::OpKernel<T> {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
const auto* ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto& ids_dims = ids_t->dims();
auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const T* ids = ids_t->data<T>();
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// split id by their shard_num.
for (size_t i = 0; i < ids_dims[0]; ++i) {
for (int i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
......@@ -6,8 +6,8 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies(profiler_py_proto profiler_py_proto_init)
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
......
......@@ -27,6 +27,11 @@ DEFINE_double(fraction_of_cpu_memory_to_use, 1,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
DEFINE_double(
fraction_of_cuda_pinned_memory_to_use, 0.5,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc");
namespace paddle {
namespace platform {
......@@ -62,5 +67,22 @@ size_t CpuMaxChunkSize() {
return CpuMaxAllocSize() / 32;
}
size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
}
size_t CUDAPinnedMinChunkSize() {
// Allow to allocate the minimum chunk size is 64 KB.
return 1 << 16;
}
size_t CUDAPinnedMaxChunkSize() {
// Allow to allocate the maximum chunk size is roughly 1/256 of CUDA_PINNED
// memory.
return CUDAPinnedMaxAllocSize() / 256;
}
} // namespace platform
} // namespace paddle
......@@ -22,11 +22,20 @@ namespace platform {
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t CpuMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
} // namespace platform
} // namespace paddle
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/string/printf.h"
#include <ostream>
#include <sstream>
......@@ -20,6 +19,7 @@
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
DECLARE_double(fraction_of_cpu_memory_to_use);
......
......@@ -257,9 +257,11 @@ class ScopedConvolutionDescriptor {
}
#endif
cudnnDataType_t compute_type =
(type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
CUDNN_CROSS_CORRELATION, type));
CUDNN_CROSS_CORRELATION, compute_type));
return desc_;
}
......
......@@ -53,6 +53,16 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
"option");
#endif
} else if (platform::is_cuda_pinned_place(p)) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(
p,
PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
#else
PADDLE_THROW(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
"option");
#endif
}
}
......@@ -186,6 +196,20 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudaStream_t CUDADeviceContext::stream() const { return stream_; }
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
eigen_device_.reset(new Eigen::DefaultDevice());
}
CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
: place_(place) {
eigen_device_.reset(new Eigen::DefaultDevice());
}
Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
return eigen_device_.get();
}
Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -118,6 +118,25 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext;
};
// Currently, CUDAPinnedDeviceContext is only used to data copying.
class CUDAPinnedDeviceContext : public DeviceContext {
public:
CUDAPinnedDeviceContext();
explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
Place GetPlace() const override;
Eigen::DefaultDevice* eigen_device() const;
private:
CUDAPinnedPlace place_;
std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
};
template <>
struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
using TYPE = CUDAPinnedDeviceContext;
};
#endif
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -24,6 +24,10 @@ void *cublas_dso_handle = nullptr;
CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cublas_v2.h>
#include <cuda.h>
#include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
......@@ -39,9 +40,9 @@ extern void *cublas_dso_handle;
template <typename... Args> \
inline cublasStatus_t operator()(Args... args) { \
typedef cublasStatus_t (*cublasFunc)(Args...); \
std::call_once(cublas_dso_flag, \
paddle::platform::dynload::GetCublasDsoHandle, \
&cublas_dso_handle); \
std::call_once(cublas_dso_flag, []() { \
cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
}); \
void *p_##__name = dlsym(cublas_dso_handle, #__name); \
return reinterpret_cast<cublasFunc>(p_##__name)(args...); \
} \
......@@ -70,6 +71,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasGemmEx); \
__macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \
......@@ -89,9 +91,15 @@ extern void *cublas_dso_handle;
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
__macro(cublasDgetriBatched)
__macro(cublasDgetriBatched);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
// APIs available after CUDA 9.0
#if CUDA_VERSION >= 9000
#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode);
CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
#endif
#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
} // namespace dynload
......
......@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
#ifdef PADDLE_USE_DSO
bool HasCUDNN() {
std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle);
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
return cudnn_dso_handle != nullptr;
}
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <cudnn.h>
#include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
......@@ -35,9 +35,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...)) (*)(Args...); \
std::call_once(cudnn_dso_flag, \
paddle::platform::dynload::GetCUDNNDsoHandle, \
&cudnn_dso_handle); \
std::call_once(cudnn_dso_flag, []() { \
cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
}); \
EnforceCUDNNLoaded(#__name); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
......@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount);
__macro(cudnnSetConvolutionGroupCount); \
__macro(cudnnSetConvolutionMathType);
CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
......
......@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUPTI
#include <cuda.h>
#include <cupti.h>
#include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
......@@ -41,9 +42,9 @@ extern void *cupti_dso_handle;
template <typename... Args> \
inline CUptiResult CUPTIAPI operator()(Args... args) { \
typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...); \
std::call_once(cupti_dso_flag, \
paddle::platform::dynload::GetCUPTIDsoHandle, \
&cupti_dso_handle); \
std::call_once(cupti_dso_flag, []() { \
cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
}); \
void *p_##__name = dlsym(cupti_dso_handle, #__name); \
return reinterpret_cast<cuptiFunc>(p_##__name)(args...); \
} \
......
......@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <curand.h>
#include <dlfcn.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
namespace paddle {
......@@ -30,9 +31,9 @@ extern void *curand_dso_handle;
template <typename... Args> \
curandStatus_t operator()(Args... args) { \
typedef curandStatus_t (*curandFunc)(Args...); \
std::call_once(curand_dso_flag, \
paddle::platform::dynload::GetCurandDsoHandle, \
&curand_dso_handle); \
std::call_once(curand_dso_flag, []() { \
curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
}); \
void *p_##__name = dlsym(curand_dso_handle, #__name); \
return reinterpret_cast<curandFunc>(p_##__name)(args...); \
} \
......
......@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include <dlfcn.h>
#include <memory>
#include <mutex>
#include <mutex> // NOLINT
#include <string>
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "paddle/fluid/platform/dynload/cupti_lib_path.h"
......@@ -65,22 +67,21 @@ static inline std::string join(const std::string& part1,
return ret;
}
static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
void** dso_handle,
static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
int dynload_flags) {
VLOG(3) << "Try to find library: " << dso_path
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
// bring System Integrity Projection (SIP), if dso_handle
// is null, search from default package path in Mac OS.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) {
if (nullptr == dso_handle) {
dso_handle =
dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
if (nullptr == dso_handle) {
if (dso_path == "libcudnn.dylib") {
LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
"For instance, sudo tar -xzf "
......@@ -91,28 +92,29 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
}
}
#endif
return dso_handle;
}
static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
const std::string& dso_name,
void** dso_handle,
bool throw_on_error = true) {
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
*dso_handle = nullptr;
void* dso_handle = nullptr;
std::string dlPath = dso_name;
if (search_root.empty()) {
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
} else {
// search xxx.so from custom path
dlPath = join(search_root, dso_name);
*dso_handle = dlopen(dlPath.c_str(), dynload_flags);
dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// if not found, search from default path
if (nullptr == *dso_handle) {
if (nullptr == dso_handle) {
LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")";
dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
}
}
auto error_msg =
......@@ -124,70 +126,71 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
"using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled.";
if (throw_on_error) {
PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
} else if (nullptr == *dso_handle) {
PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror());
} else if (nullptr == dso_handle) {
LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
}
return dso_handle;
}
void GetCublasDsoHandle(void** dso_handle) {
void* GetCublasDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
#else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
#endif
}
void GetCUDNNDsoHandle(void** dso_handle) {
void* GetCUDNNDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
false);
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
#else
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
#endif
}
void GetCUPTIDsoHandle(void** dso_handle) {
void* GetCUPTIDsoHandle() {
std::string cupti_path = cupti_lib_path;
if (!FLAGS_cupti_dir.empty()) {
cupti_path = FLAGS_cupti_dir;
}
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false);
return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
#else
GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false);
return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
void* GetCurandDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
#else
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
#endif
}
void GetWarpCTCDsoHandle(void** dso_handle) {
void* GetWarpCTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
#else
GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
#endif
}
void GetLapackDsoHandle(void** dso_handle) {
void* GetLapackDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
#else
GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
#endif
}
void GetNCCLDsoHandle(void** dso_handle) {
void* GetNCCLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
#else
GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
#endif
}
......
......@@ -18,55 +18,13 @@ namespace paddle {
namespace platform {
namespace dynload {
/**
* @brief load the DSO of CUBLAS
*
* @param **dso_handle dso handler
*
*/
void GetCublasDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CUDNN
*
* @param **dso_handle dso handler
*
*/
void GetCUDNNDsoHandle(void** dso_handle);
void GetCUPTIDsoHandle(void** dso_handle);
/**
* @brief load the DSO of CURAND
*
* @param **dso_handle dso handler
*
*/
void GetCurandDsoHandle(void** dso_handle);
/**
* @brief load the DSO of warp-ctc
*
* @param **dso_handle dso handler
*
*/
void GetWarpCTCDsoHandle(void** dso_handle);
/**
* @brief load the DSO of lapack
*
* @param **dso_handle dso handler
*
*/
void GetLapackDsoHandle(void** dso_handle);
/**
* @brief load the DSO of NVIDIA nccl
*
* @param **dso_handle dso handler
*
*/
void GetNCCLDsoHandle(void** dso_handle);
void* GetCublasDsoHandle();
void* GetCUDNNDsoHandle();
void* GetCUPTIDsoHandle();
void* GetCurandDsoHandle();
void* GetWarpCTCDsoHandle();
void* GetLapackDsoHandle();
void* GetNCCLDsoHandle();
} // namespace dynload
} // namespace platform
......
......@@ -25,11 +25,6 @@ void *nccl_dso_handle;
NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
void LoadNCCLDSO() {
platform::call_once(nccl_dso_flag,
[] { GetNCCLDsoHandle(&nccl_dso_handle); });
}
} // namespace dynload
} // namespace platform
} // namespace paddle
......@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <dlfcn.h>
#include <nccl.h>
#include <mutex>
#include <mutex> // NOLINT
#include "paddle/fluid/platform/call_once.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -28,14 +29,15 @@ extern std::once_flag nccl_dso_flag;
extern void* nccl_dso_handle;
#ifdef PADDLE_USE_DSO
extern void LoadNCCLDSO();
#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using nccl_func = decltype(__name(args...)) (*)(Args...); \
paddle::platform::dynload::LoadNCCLDSO(); \
std::call_once(nccl_dso_flag, []() { \
nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
}); \
void* p_##__name = dlsym(nccl_dso_handle, #__name); \
return reinterpret_cast<nccl_func>(p_##__name)(args...); \
} \
......
......@@ -15,9 +15,10 @@ limitations under the License. */
#pragma once
#include <dlfcn.h>
#include <mutex>
#include "ctc.h"
#include <mutex> // NOLINT
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
#include "warpctc/include/ctc.h"
namespace paddle {
namespace platform {
......@@ -36,9 +37,9 @@ extern void* warpctc_dso_handle;
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using warpctcFunc = decltype(__name(args...)) (*)(Args...); \
std::call_once(warpctc_dso_flag, \
paddle::platform::dynload::GetWarpCTCDsoHandle, \
&warpctc_dso_handle); \
std::call_once(warpctc_dso_flag, []() { \
warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
}); \
void* p_##_name = dlsym(warpctc_dso_handle, #__name); \
return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \
} \
......
......@@ -16,35 +16,35 @@ limitations under the License. */
#include <dlfcn.h> // for dladdr
#include <execinfo.h> // for backtrace
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif // __GNUC__
#ifdef PADDLE_WITH_CUDA
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif // PADDLE_WITH_CUDA
#include <iomanip>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/to_string.h"
#ifdef __GNUC__
#include <cxxabi.h> // for __cxa_demangle
#endif
#include <glog/logging.h>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cublas.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/curand.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include <cublas_v2.h>
#include <cudnn.h>
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#endif
namespace paddle {
......@@ -185,7 +185,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
}
}
#endif // PADDLE_ONLY_CPU
#endif // PADDLE_WITH_CUDA
template <typename T>
inline void throw_on_error(T e) {
......
......@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(
......@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GE(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(
......@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GT(1, 2UL);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(
......@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) {
try {
int* a = nullptr;
PADDLE_ENFORCE_NOT_NULL(a);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <stdint.h>
#include <limits>
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
......@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 {
HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
HOSTDEVICE inline explicit operator int8_t() const {
return static_cast<int8_t>(float(*this));
return static_cast<int8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint8_t() const {
return static_cast<uint8_t>(float(*this));
return static_cast<uint8_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int16_t() const {
return static_cast<int16_t>(float(*this));
return static_cast<int16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint16_t() const {
return static_cast<uint16_t>(float(*this));
return static_cast<uint16_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int32_t() const {
return static_cast<int32_t>(float(*this));
return static_cast<int32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint32_t() const {
return static_cast<uint32_t>(float(*this));
return static_cast<uint32_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator int64_t() const {
return static_cast<int64_t>(float(*this));
return static_cast<int64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator uint64_t() const {
return static_cast<uint64_t>(float(*this));
return static_cast<uint64_t>(static_cast<float>(*this));
}
HOSTDEVICE inline explicit operator double() const {
return static_cast<double>(float(*this));
return static_cast<double>(static_cast<float>(*this));
}
private:
......@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hadd(a, b);
#else
float res = float(float16(a)) + float(float16(b));
float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
return half(float16(res));
#endif
}
......@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hsub(a, b);
#else
float res = float(float16(a)) - float(float16(b));
float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
return half(float16(res));
#endif
}
......@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hmul(a, b);
#else
float res = float(float16(a)) * float(float16(b));
float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
return half(float16(res));
#endif
}
......@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
float denom = __half2float(b);
return __float2half(num / denom);
#else
float res = float(float16(a)) / float(float16(b));
float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
return half(float16(res));
#endif
}
......@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hneg(a);
#else
float res = -float(float16(a));
float res = -static_cast<float>(float16(a));
return half(float16(res));
#endif
}
DEVICE inline half& operator+=(half& a, const half& b) {
DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT
a = a + b;
return a;
}
DEVICE inline half& operator-=(half& a, const half& b) {
DEVICE inline half& operator-=(half& a, const half& b) { // NOLINT
a = a - b;
return a;
}
DEVICE inline half& operator*=(half& a, const half& b) {
DEVICE inline half& operator*=(half& a, const half& b) { // NOLINT
a = a * b;
return a;
}
DEVICE inline half& operator/=(half& a, const half& b) {
DEVICE inline half& operator/=(half& a, const half& b) { // NOLINT
a = a / b;
return a;
}
......@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(a, b);
#else
return float(float16(a)) == float(float16(b));
return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
#endif
}
......@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(a, b);
#else
return float(float16(a)) != float(float16(b));
return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
#endif
}
......@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(a, b);
#else
return float(float16(a)) < float(float16(b));
return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
#endif
}
......@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(a, b);
#else
return float(float16(a)) <= float(float16(b));
return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
#endif
}
......@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(a, b);
#else
return float(float16(a)) > float(float16(b));
return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
#endif
}
......@@ -477,7 +478,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(a, b);
#else
return float(float16(a)) >= float(float16(b));
return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
#endif
}
......@@ -489,7 +490,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hadd(half(a), half(b)));
#else
return float16(float(a) + float(b));
return float16(static_cast<float>(a) + static_cast<float>(b));
#endif
}
......@@ -497,7 +498,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hsub(half(a), half(b)));
#else
return float16(float(a) - float(b));
return float16(static_cast<float>(a) - static_cast<float>(b));
#endif
}
......@@ -505,7 +506,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return float16(__hmul(half(a), half(b)));
#else
return float16(float(a) * float(b));
return float16(static_cast<float>(a) * static_cast<float>(b));
#endif
}
......@@ -516,7 +517,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
float denom = __half2float(half(b));
return float16(num / denom);
#else
return float16(float(a) / float(b));
return float16(static_cast<float>(a) / static_cast<float>(b));
#endif
}
......@@ -530,22 +531,22 @@ HOSTDEVICE inline float16 operator-(const float16& a) {
#endif
}
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b;
return a;
}
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = a - b;
return a;
}
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = a * b;
return a;
}
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = a / b;
return a;
}
......@@ -554,7 +555,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __heq(half(a), half(b));
#else
return float(a) == float(b);
return static_cast<float>(a) == static_cast<float>(b);
#endif
}
......@@ -562,7 +563,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hne(half(a), half(b));
#else
return float(a) != float(b);
return static_cast<float>(a) != static_cast<float>(b);
#endif
}
......@@ -570,7 +571,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hlt(half(a), half(b));
#else
return float(a) < float(b);
return static_cast<float>(a) < static_cast<float>(b);
#endif
}
......@@ -578,7 +579,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hle(half(a), half(b));
#else
return float(a) <= float(b);
return static_cast<float>(a) <= static_cast<float>(b);
#endif
}
......@@ -586,7 +587,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hgt(half(a), half(b));
#else
return float(a) > float(b);
return static_cast<float>(a) > static_cast<float>(b);
#endif
}
......@@ -594,7 +595,7 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hge(half(a), half(b));
#else
return float(a) >= float(b);
return static_cast<float>(a) >= static_cast<float>(b);
#endif
}
......@@ -679,22 +680,22 @@ inline float16 operator-(const float16& a) {
return res;
}
inline float16& operator+=(float16& a, const float16& b) {
inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = a + b;
return a;
}
inline float16& operator-=(float16& a, const float16& b) {
inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = a - b;
return a;
}
inline float16& operator*=(float16& a, const float16& b) {
inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = a * b;
return a;
}
inline float16& operator/=(float16& a, const float16& b) {
inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = a / b;
return a;
}
......@@ -784,19 +785,19 @@ inline bool operator>=(const float16& a, const float16& b) {
// Arithmetic operators for float16, software emulated on other CPU
#else
inline float16 operator+(const float16& a, const float16& b) {
return float16(float(a) + float(b));
return float16(static_cast<float>(a) + static_cast<float>(b));
}
inline float16 operator-(const float16& a, const float16& b) {
return float16(float(a) - float(b));
return float16(static_cast<float>(a) - static_cast<float>(b));
}
inline float16 operator*(const float16& a, const float16& b) {
return float16(float(a) * float(b));
return float16(static_cast<float>(a) * static_cast<float>(b));
}
inline float16 operator/(const float16& a, const float16& b) {
return float16(float(a) / float(b));
return float16(static_cast<float>(a) / static_cast<float>(b));
}
inline float16 operator-(const float16& a) {
......@@ -805,51 +806,57 @@ inline float16 operator-(const float16& a) {
return res;
}
inline float16& operator+=(float16& a, const float16& b) {
a = float16(float(a) + float(b));
inline float16& operator+=(float16& a, const float16& b) { // NOLINT
a = float16(static_cast<float>(a) + static_cast<float>(b));
return a;
}
inline float16& operator-=(float16& a, const float16& b) {
a = float16(float(a) - float(b));
inline float16& operator-=(float16& a, const float16& b) { // NOLINT
a = float16(static_cast<float>(a) - static_cast<float>(b));
return a;
}
inline float16& operator*=(float16& a, const float16& b) {
a = float16(float(a) * float(b));
inline float16& operator*=(float16& a, const float16& b) { // NOLINT
a = float16(static_cast<float>(a) * static_cast<float>(b));
return a;
}
inline float16& operator/=(float16& a, const float16& b) {
a = float16(float(a) / float(b));
inline float16& operator/=(float16& a, const float16& b) { // NOLINT
a = float16(static_cast<float>(a) / static_cast<float>(b));
return a;
}
inline bool operator==(const float16& a, const float16& b) {
return float(a) == float(b);
return static_cast<float>(a) == static_cast<float>(b);
}
inline bool operator!=(const float16& a, const float16& b) {
return float(a) != float(b);
return static_cast<float>(a) != static_cast<float>(b);
}
inline bool operator<(const float16& a, const float16& b) {
return float(a) < float(b);
return static_cast<float>(a) < static_cast<float>(b);
}
inline bool operator<=(const float16& a, const float16& b) {
return float(a) <= float(b);
return static_cast<float>(a) <= static_cast<float>(b);
}
inline bool operator>(const float16& a, const float16& b) {
return float(a) > float(b);
return static_cast<float>(a) > static_cast<float>(b);
}
inline bool operator>=(const float16& a, const float16& b) {
return float(a) >= float(b);
return static_cast<float>(a) >= static_cast<float>(b);
}
#endif
HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
float16 res;
res.x = a;
return res;
}
HOSTDEVICE inline bool(isnan)(const float16& a) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hisnan(half(a));
......@@ -886,28 +893,116 @@ struct is_pod<paddle::platform::float16> {
is_standard_layout<paddle::platform::float16>::value;
};
template <>
struct numeric_limits<paddle::platform::float16> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3;
static const int max_digits10 = 5;
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static paddle::platform::float16(min)() {
return paddle::platform::raw_uint16_to_float16(0x400);
}
static paddle::platform::float16 lowest() {
return paddle::platform::raw_uint16_to_float16(0xfbff);
}
static paddle::platform::float16(max)() {
return paddle::platform::raw_uint16_to_float16(0x7bff);
}
static paddle::platform::float16 epsilon() {
return paddle::platform::raw_uint16_to_float16(0x0800);
}
static paddle::platform::float16 round_error() {
return paddle::platform::float16(0.5);
}
static paddle::platform::float16 infinity() {
return paddle::platform::raw_uint16_to_float16(0x7c00);
}
static paddle::platform::float16 quiet_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 signaling_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7e00);
}
static paddle::platform::float16 denorm_min() {
return paddle::platform::raw_uint16_to_float16(0x1);
}
};
} // namespace std
namespace Eigen {
using float16 = paddle::platform::float16;
template <>
struct NumTraits<float16> : GenericNumTraits<float16> {
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
HOSTDEVICE static inline float16 epsilon() {
return paddle::platform::raw_uint16_to_float16(0x0800);
}
HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
HOSTDEVICE static inline float16 highest() {
return paddle::platform::raw_uint16_to_float16(0x7bff);
}
HOSTDEVICE static inline float16 lowest() {
return paddle::platform::raw_uint16_to_float16(0xfbff);
}
HOSTDEVICE static inline float16 infinity() {
return paddle::platform::raw_uint16_to_float16(0x7c00);
}
HOSTDEVICE static inline float16 quiet_NaN() {
return paddle::platform::raw_uint16_to_float16(0x7c01);
}
};
namespace numext {
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
const paddle::platform::float16& a) {
HOSTDEVICE inline bool(isnan)(const float16& a) {
return (paddle::platform::isnan)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
const paddle::platform::float16& a) {
HOSTDEVICE inline bool(isinf)(const float16& a) {
return (paddle::platform::isinf)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
const paddle::platform::float16& a) {
HOSTDEVICE inline bool(isfinite)(const float16& a) {
return (paddle::platform::isfinite)(a);
}
template <>
HOSTDEVICE inline float16 exp(const float16& a) {
return float16(::expf(static_cast<float>(a)));
}
} // namespace numext
} // namespace Eigen
......@@ -14,8 +14,9 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#include "gflags/gflags.h"
#include <algorithm>
#include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
......@@ -77,8 +78,8 @@ void SetDeviceId(int id) {
"cudaSetDevice failed in paddle::platform::SetDeviceId");
}
void GpuMemoryUsage(size_t &available, size_t &total) {
PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
void GpuMemoryUsage(size_t *available, size_t *total) {
PADDLE_ENFORCE(cudaMemGetInfo(available, total),
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
}
......@@ -86,7 +87,7 @@ size_t GpuMaxAllocSize() {
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(available, total);
GpuMemoryUsage(&available, &total);
// Reserve the rest for page tables, etc.
return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
......@@ -101,7 +102,7 @@ size_t GpuMaxChunkSize() {
size_t total = 0;
size_t available = 0;
GpuMemoryUsage(available, total);
GpuMemoryUsage(&available, &total);
VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
<< total / 1024 / 1024 << "M";
size_t reserving = static_cast<size_t>(0.05 * total);
......
......@@ -23,10 +23,6 @@ limitations under the License. */
namespace paddle {
namespace platform {
//! Environment variable: fraction of GPU memory to use on each device.
const std::string kEnvFractionGpuMemoryToUse =
"PADDLE_FRACTION_GPU_MEMORY_TO_USE";
//! Get the total number of GPU devices in system.
int GetCUDADeviceCount();
......@@ -46,7 +42,7 @@ int GetCurrentDeviceId();
void SetDeviceId(int device_id);
//! Get the memory usage of current GPU device.
void GpuMemoryUsage(size_t &available, size_t &total);
void GpuMemoryUsage(size_t *available, size_t *total);
//! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize();
......
......@@ -26,6 +26,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const CUDAPlace &p) {
os_ << "CUDAPlace(" << p.device << ")";
}
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
std::ostream &os_;
......@@ -40,12 +41,19 @@ const Place &get_place() { return the_default_place; }
const CUDAPlace default_gpu() { return CUDAPlace(0); }
const CPUPlace default_cpu() { return CPUPlace(); }
const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }
bool is_gpu_place(const Place &p) {
return boost::apply_visitor(IsCUDAPlace(), p);
}
bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
bool is_cuda_pinned_place(const Place &p) {
return boost::apply_visitor(IsCUDAPinnedPlace(), p);
}
bool places_are_same_class(const Place &p1, const Place &p2) {
return p1.which() == p2.which();
......@@ -53,7 +61,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
bool is_same_place(const Place &p1, const Place &p2) {
if (places_are_same_class(p1, p2)) {
if (is_cpu_place(p1)) {
if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
return true;
} else {
return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
......
......@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
......@@ -45,12 +46,33 @@ struct CUDAPlace {
int device;
};
struct CUDAPinnedPlace {
CUDAPinnedPlace() {}
// needed for variant equality comparison
inline bool operator==(const CUDAPinnedPlace &) const { return true; }
inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const CUDAPlace &gpu) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
typedef boost::variant<CUDAPlace, CPUPlace> Place;
struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &cpu) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;
using PlaceList = std::vector<Place>;
......@@ -59,9 +81,11 @@ const Place &get_place();
const CUDAPlace default_gpu();
const CPUPlace default_cpu();
const CUDAPinnedPlace default_cuda_pinned();
bool is_gpu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
......@@ -95,6 +119,16 @@ struct PlaceVisitorWrapper
#else
PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(
const CUDAPinnedPlace &cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
return visitor_(cuda_pinned);
#else
PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda_pinned");
return typename Visitor::result_type();
#endif
}
};
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
......@@ -15,4 +15,6 @@ if(WITH_PYTHON)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU)
cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
endif(WITH_PYTHON)
......@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "const_value.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace pybind {
void BindConstValue(pybind11::module& m) {
m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
m.def("kTempVarName", [] { return framework::kTempVarName; });
m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
void BindConstValue(pybind11::module* m) {
m->def("kEmptyVarName", [] { return framework::kEmptyVarName; });
m->def("kTempVarName", [] { return framework::kTempVarName; });
m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
}
} // namespace pybind
......
......@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
extern void BindConstValue(pybind11::module& m);
void BindConstValue(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -17,8 +17,8 @@ limitations under the License. */
namespace paddle {
namespace pybind {
void BindException(pybind11::module& m) {
static pybind11::exception<platform::EnforceNotMet> exc(m, "EnforceNotMet");
void BindException(pybind11::module* m) {
static pybind11::exception<platform::EnforceNotMet> exc(*m, "EnforceNotMet");
pybind11::register_exception_translator([](std::exception_ptr p) {
try {
if (p) std::rethrow_exception(p);
......@@ -27,7 +27,8 @@ void BindException(pybind11::module& m) {
}
});
m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); });
m->def("__unittest_throw_exception__",
[] { PADDLE_THROW("test exception"); });
}
} // namespace pybind
......
......@@ -11,14 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include "paddle/fluid/platform/enforce.h"
#include "pybind11/pybind11.h"
namespace paddle {
namespace pybind {
extern void BindException(pybind11::module& m);
void BindException(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -11,10 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/protobuf.h"
#include <deque>
#include <iostream>
#include <string>
#include <tuple>
#include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
......@@ -95,10 +98,11 @@ struct type_caster<boost::variant<Args...>>
namespace paddle {
namespace pybind {
using namespace paddle::framework; // NOLINT
namespace pd = paddle::framework;
template <typename T>
static py::bytes SerializeMessage(T &self) {
static pybind11::bytes SerializeMessage(
T &self) { // NOLINT due to pybind11 convention.
// Check IsInitialized in Python
std::string retv;
PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
......@@ -107,24 +111,24 @@ static py::bytes SerializeMessage(T &self) {
}
// Bind Methods
void BindProgramDesc(py::module &m) {
py::class_<ProgramDesc>(m, "ProgramDesc", "")
.def(py::init<>())
void BindProgramDesc(pybind11::module *m) {
pybind11::class_<pd::ProgramDesc>(*m, "ProgramDesc", "")
.def(pybind11::init<>())
.def("__init__",
[](ProgramDesc &self, const ProgramDesc &other) {
new (&self) ProgramDesc(other);
[](pd::ProgramDesc &self, const pd::ProgramDesc &other) {
new (&self) pd::ProgramDesc(other);
})
.def("__init__",
[](ProgramDesc &self, const py::bytes &binary_str) {
[](pd::ProgramDesc &self, const pybind11::bytes &binary_str) {
std::string str(binary_str);
new (&self) ProgramDesc(str);
new (&self) pd::ProgramDesc(str);
})
.def("append_block", &ProgramDesc::AppendBlock,
py::return_value_policy::reference)
.def("append_block", &pd::ProgramDesc::AppendBlock,
pybind11::return_value_policy::reference)
.def("append_backward",
[](ProgramDesc &program_desc, const VarDesc &target,
[](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
const std::unordered_set<std::string> &no_grad_vars) {
ParamGradInfoMap param_grad_map =
pd::ParamGradInfoMap param_grad_map =
AppendBackward(program_desc, target, no_grad_vars);
std::unordered_map<
std::string, std::tuple<std::string /* grad_var_name */,
......@@ -138,172 +142,184 @@ void BindProgramDesc(py::module &m) {
}
return retv;
})
.def("block", &ProgramDesc::MutableBlock,
py::return_value_policy::reference)
.def("num_blocks", &ProgramDesc::Size)
.def("serialize_to_string", SerializeMessage<ProgramDesc>)
.def("block", &pd::ProgramDesc::MutableBlock,
pybind11::return_value_policy::reference)
.def("num_blocks", &pd::ProgramDesc::Size)
.def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
.def("parse_from_string",
[](ProgramDesc &program_desc, const std::string &data) {
proto::ProgramDesc *desc = program_desc.Proto();
[](pd::ProgramDesc &program_desc, const std::string &data) {
pd::proto::ProgramDesc *desc = program_desc.Proto();
PADDLE_ENFORCE(desc->ParseFromString(data),
"Fail to parse ProgramDesc from string. This could "
"be a bug of Paddle.");
});
}
void BindBlockDesc(py::module &m) {
py::class_<BlockDesc>(m, "BlockDesc", "")
.def_property_readonly("id", &BlockDesc::ID)
.def_property_readonly("parent", &BlockDesc::Parent)
.def("get_forward_block_idx", &BlockDesc::ForwardBlockID)
.def("set_forward_block_idx", &BlockDesc::SetForwardBlockID)
.def("append_op", &BlockDesc::AppendOp,
py::return_value_policy::reference)
.def("prepend_op", &BlockDesc::PrependOp,
py::return_value_policy::reference)
.def("insert_op", &BlockDesc::InsertOp,
py::return_value_policy::reference)
.def("remove_op", &BlockDesc::RemoveOp)
void BindBlockDesc(pybind11::module *m) {
pybind11::class_<pd::BlockDesc>(*m, "BlockDesc", "")
.def_property_readonly("id", &pd::BlockDesc::ID)
.def_property_readonly("parent", &pd::BlockDesc::Parent)
.def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
.def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
.def("append_op", &pd::BlockDesc::AppendOp,
pybind11::return_value_policy::reference)
.def("prepend_op", &pd::BlockDesc::PrependOp,
pybind11::return_value_policy::reference)
.def("insert_op", &pd::BlockDesc::InsertOp,
pybind11::return_value_policy::reference)
.def("remove_op", &pd::BlockDesc::RemoveOp)
.def("var",
[](BlockDesc &self, py::bytes byte_name) {
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.Var(name);
},
py::return_value_policy::reference)
pybind11::return_value_policy::reference)
.def("has_var",
[](BlockDesc &self, py::bytes byte_name) {
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.HasVar(name);
},
py::return_value_policy::reference)
pybind11::return_value_policy::reference)
.def("rename_var",
[](BlockDesc &self, const py::bytes &byte_name,
const py::bytes &byte_name_new) {
[](pd::BlockDesc &self, const pybind11::bytes &byte_name,
const pybind11::bytes &byte_name_new) {
std::string name = byte_name;
std::string new_name = byte_name_new;
self.RenameVar(name, new_name);
})
.def("has_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.HasVarRecursive(name);
})
.def("find_var",
[](BlockDesc &self, py::bytes byte_name) {
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.FindVar(name);
},
py::return_value_policy::reference)
pybind11::return_value_policy::reference)
.def("find_var_recursive",
[](BlockDesc &self, py::bytes byte_name) {
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.FindVarRecursive(name);
},
py::return_value_policy::reference)
.def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
.def("op_size", &BlockDesc::OpSize)
.def("op", &BlockDesc::Op, py::return_value_policy::reference)
.def("serialize_to_string", SerializeMessage<BlockDesc>);
pybind11::return_value_policy::reference)
.def("remove_var",
[](pd::BlockDesc &self, pybind11::bytes byte_name) {
std::string name = byte_name;
return self.RemoveVar(name);
},
pybind11::return_value_policy::reference)
.def("all_vars", &pd::BlockDesc::AllVars,
pybind11::return_value_policy::reference)
.def("op_size", &pd::BlockDesc::OpSize)
.def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference)
.def("serialize_to_string", SerializeMessage<pd::BlockDesc>);
}
void BindVarDsec(py::module &m) {
py::class_<VarDesc> var_desc(m, "VarDesc", "");
void BindVarDsec(pybind11::module *m) {
pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
var_desc
.def("name",
[](VarDesc &self) {
py::bytes name = self.Name();
[](pd::VarDesc &self) {
pybind11::bytes name = self.Name();
return name;
},
py::return_value_policy::reference)
.def("set_name", &VarDesc::SetName)
.def("set_shape", &VarDesc::SetShape)
.def("set_shapes", &VarDesc::SetShapes)
.def("set_dtype", &VarDesc::SetDataType)
.def("set_dtypes", &VarDesc::SetDataTypes)
.def("set_capacity", &VarDesc::SetCapacity)
.def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
.def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
.def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLoDLevel)
.def("lod_levels", &VarDesc::GetLoDLevels,
py::return_value_policy::reference)
.def("set_lod_level", &VarDesc::SetLoDLevel)
.def("set_lod_levels", &VarDesc::SetLoDLevels)
.def("type", &VarDesc::GetType)
.def("set_type", &VarDesc::SetType)
.def("serialize_to_string", SerializeMessage<VarDesc>)
.def("persistable", &VarDesc::Persistable)
.def("set_persistable", &VarDesc::SetPersistable);
pybind11::return_value_policy::reference)
.def("set_name", &pd::VarDesc::SetName)
.def("set_shape", &pd::VarDesc::SetShape)
.def("set_shapes", &pd::VarDesc::SetShapes)
.def("set_dtype", &pd::VarDesc::SetDataType)
.def("set_dtypes", &pd::VarDesc::SetDataTypes)
.def("set_capacity", &pd::VarDesc::SetCapacity)
.def("shape", &pd::VarDesc::GetShape,
pybind11::return_value_policy::reference)
.def("shapes", &pd::VarDesc::GetShapes,
pybind11::return_value_policy::reference)
.def("dtype", &pd::VarDesc::GetDataType,
pybind11::return_value_policy::reference)
.def("dtypes", &pd::VarDesc::GetDataTypes,
pybind11::return_value_policy::reference)
.def("lod_level", &pd::VarDesc::GetLoDLevel)
.def("lod_levels", &pd::VarDesc::GetLoDLevels,
pybind11::return_value_policy::reference)
.def("set_lod_level", &pd::VarDesc::SetLoDLevel)
.def("set_lod_levels", &pd::VarDesc::SetLoDLevels)
.def("type", &pd::VarDesc::GetType)
.def("set_type", &pd::VarDesc::SetType)
.def("serialize_to_string", SerializeMessage<pd::VarDesc>)
.def("persistable", &pd::VarDesc::Persistable)
.def("set_persistable", &pd::VarDesc::SetPersistable);
py::enum_<proto::VarType::Type>(var_desc, "VarType", "")
.value("BOOL", proto::VarType::BOOL)
.value("INT16", proto::VarType::INT16)
.value("INT32", proto::VarType::INT32)
.value("INT64", proto::VarType::INT64)
.value("FP16", proto::VarType::FP16)
.value("FP32", proto::VarType::FP32)
.value("FP64", proto::VarType::FP64)
.value("LOD_TENSOR", proto::VarType::LOD_TENSOR)
.value("SELECTED_ROWS", proto::VarType::SELECTED_ROWS)
.value("FEED_MINIBATCH", proto::VarType::FEED_MINIBATCH)
.value("FETCH_LIST", proto::VarType::FETCH_LIST)
.value("STEP_SCOPES", proto::VarType::STEP_SCOPES)
.value("LOD_RANK_TABLE", proto::VarType::LOD_RANK_TABLE)
.value("LOD_TENSOR_ARRAY", proto::VarType::LOD_TENSOR_ARRAY)
.value("CHANNEL", proto::VarType::CHANNEL)
.value("PLACE_LIST", proto::VarType::PLACE_LIST)
.value("READER", proto::VarType::READER)
.value("RAW", proto::VarType::RAW);
pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
.value("BOOL", pd::proto::VarType::BOOL)
.value("INT16", pd::proto::VarType::INT16)
.value("INT32", pd::proto::VarType::INT32)
.value("INT64", pd::proto::VarType::INT64)
.value("FP16", pd::proto::VarType::FP16)
.value("FP32", pd::proto::VarType::FP32)
.value("FP64", pd::proto::VarType::FP64)
.value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
.value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
.value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
.value("FETCH_LIST", pd::proto::VarType::FETCH_LIST)
.value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
.value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
.value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
.value("CHANNEL", pd::proto::VarType::CHANNEL)
.value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
.value("READER", pd::proto::VarType::READER)
.value("RAW", pd::proto::VarType::RAW);
}
void BindOpDesc(py::module &m) {
py::enum_<proto::AttrType>(m, "AttrType", "")
.value("INT", proto::AttrType::INT)
.value("INTS", proto::AttrType::INTS)
.value("FLOAT", proto::AttrType::FLOAT)
.value("FLOATS", proto::AttrType::FLOATS)
.value("STRING", proto::AttrType::STRING)
.value("STRINGS", proto::AttrType::STRINGS)
.value("BOOL", proto::AttrType::BOOLEAN)
.value("BOOLS", proto::AttrType::BOOLEANS)
.value("BLOCK", proto::AttrType::BLOCK);
void BindOpDesc(pybind11::module *m) {
pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
.value("INT", pd::proto::AttrType::INT)
.value("INTS", pd::proto::AttrType::INTS)
.value("FLOAT", pd::proto::AttrType::FLOAT)
.value("FLOATS", pd::proto::AttrType::FLOATS)
.value("STRING", pd::proto::AttrType::STRING)
.value("STRINGS", pd::proto::AttrType::STRINGS)
.value("BOOL", pd::proto::AttrType::BOOLEAN)
.value("BOOLS", pd::proto::AttrType::BOOLEANS)
.value("BLOCK", pd::proto::AttrType::BLOCK);
py::class_<OpDesc> op_desc(m, "OpDesc", "");
pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
op_desc
.def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
py::return_value_policy::reference)
.def("copy_from", &OpDesc::CopyFrom)
.def("type", &OpDesc::Type)
.def("set_type", &OpDesc::SetType)
.def("input", &OpDesc::Input)
.def("input_names", &OpDesc::InputNames)
.def("output", &OpDesc::Output)
.def("output_names", &OpDesc::OutputNames)
.def("set_input", &OpDesc::SetInput)
.def("set_output", &OpDesc::SetOutput)
.def("input_arg_names", &OpDesc::InputArgumentNames)
.def("output_arg_names", &OpDesc::OutputArgumentNames)
.def("rename_input", &OpDesc::RenameInput)
.def("rename_output", &OpDesc::RenameOutput)
.def("has_attr", &OpDesc::HasAttr)
.def("attr_type", &OpDesc::GetAttrType)
.def("attr_names", &OpDesc::AttrNames)
.def("set_attr", &OpDesc::SetAttr)
.def("attr", &OpDesc::GetAttr)
.def("set_block_attr", &OpDesc::SetBlockAttr)
.def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
pybind11::return_value_policy::reference)
.def("copy_from", &pd::OpDesc::CopyFrom)
.def("type", &pd::OpDesc::Type)
.def("set_type", &pd::OpDesc::SetType)
.def("input", &pd::OpDesc::Input)
.def("input_names", &pd::OpDesc::InputNames)
.def("output", &pd::OpDesc::Output)
.def("output_names", &pd::OpDesc::OutputNames)
.def("set_input", &pd::OpDesc::SetInput)
.def("set_output", &pd::OpDesc::SetOutput)
.def("input_arg_names", &pd::OpDesc::InputArgumentNames)
.def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
.def("rename_input", &pd::OpDesc::RenameInput)
.def("rename_output", &pd::OpDesc::RenameOutput)
.def("has_attr", &pd::OpDesc::HasAttr)
.def("attr_type", &pd::OpDesc::GetAttrType)
.def("attr_names", &pd::OpDesc::AttrNames)
.def("set_attr", &pd::OpDesc::SetAttr)
.def("attr", &pd::OpDesc::GetAttr)
.def("set_block_attr", &pd::OpDesc::SetBlockAttr)
.def("set_serialized_attr",
[](OpDesc &self, const std::string &name,
const py::bytes &seriralized) {
[](pd::OpDesc &self, const std::string &name,
const pybind11::bytes &seriralized) {
std::string ser(seriralized);
self.SetAttr(name, ser);
})
.def("block_attr", &OpDesc::GetBlockAttr)
.def("check_attrs", &OpDesc::CheckAttrs)
.def("infer_shape", &OpDesc::InferShape)
.def("infer_var_type", &OpDesc::InferVarType)
.def("serialize_to_string", SerializeMessage<OpDesc>)
.def("block", &OpDesc::Block, py::return_value_policy::reference);
.def("block_attr", &pd::OpDesc::GetBlockAttr)
.def("check_attrs", &pd::OpDesc::CheckAttrs)
.def("infer_shape", &pd::OpDesc::InferShape)
.def("infer_var_type", &pd::OpDesc::InferVarType)
.def("serialize_to_string", SerializeMessage<pd::OpDesc>)
.def("block", &pd::OpDesc::Block,
pybind11::return_value_policy::reference);
}
} // namespace pybind
......
......@@ -11,25 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <fstream>
#include <vector>
#include "paddle/fluid/platform/variant.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindProgramDesc(py::module& m);
void BindBlockDesc(py::module& m);
void BindVarDsec(py::module& m);
void BindOpDesc(py::module& m);
void BindProgramDesc(pybind11::module* m);
void BindBlockDesc(pybind11::module* m);
void BindVarDsec(pybind11::module* m);
void BindOpDesc(pybind11::module* m);
} // namespace pybind
} // namespace paddle
......@@ -11,11 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <algorithm>
#include <map>
#include <mutex> // NOLINT // for call_once
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/pybind/protobuf.h"
#include <mutex> // for call_once
#include <unordered_map>
#include "paddle/fluid/framework/backward.h"
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/executor.h"
......@@ -32,7 +38,6 @@ limitations under the License. */
#include "paddle/fluid/operators/cond_op.h"
#include "paddle/fluid/operators/net_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/const_value.h"
......@@ -69,7 +74,7 @@ PYBIND11_PLUGIN(core) {
// not cause namespace pollution.
using namespace paddle::framework; // NOLINT
BindException(m);
BindException(&m);
py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer(
......@@ -100,6 +105,14 @@ PYBIND11_PLUGIN(core) {
[](Tensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place);
})
.def("alloc_int",
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<int>(place);
})
.def("alloc_float",
[](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
self.mutable_data<float>(place);
})
.def("set", PyCPUTensorSetFromArray<float>)
.def("set", PyCPUTensorSetFromArray<int>)
.def("set", PyCPUTensorSetFromArray<double>)
......@@ -113,6 +126,12 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDATensorSetFromArray<int64_t>)
.def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<float>)
.def("set", PyCUDAPinnedTensorSetFromArray<int>)
.def("set", PyCUDAPinnedTensorSetFromArray<double>)
.def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<bool>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
#endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>)
......@@ -317,7 +336,17 @@ All parameter, weight, gradient are variables in Paddle.
#else
return new paddle::platform::CUDADeviceContext(place);
#endif
});
})
.def_static("create",
[](paddle::platform::CUDAPinnedPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW(
"CUDAPinnedPlace is not supported in CPU device.");
#else
return new paddle::platform::CUDAPinnedDeviceContext(place);
#endif
});;
// clang-format on
#ifdef PADDLE_WITH_CUDA
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
......@@ -330,6 +359,10 @@ All parameter, weight, gradient are variables in Paddle.
.def(py::init<>())
.def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def(py::init<>())
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place")
.def(py::init<>())
.def("set_place",
......@@ -339,6 +372,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("set_place",
[](platform::Place &self, const platform::CUDAPlace &gpu_place) {
self = gpu_place;
})
.def("set_place", [](platform::Place &self,
const platform::CUDAPinnedPlace &cuda_pinned_place) {
self = cuda_pinned_place;
});
py::class_<OperatorBase>(m, "Operator")
......@@ -363,6 +400,11 @@ All parameter, weight, gradient are variables in Paddle.
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPlace &place) { self.Run(scope, place); })
.def("run",
[](OperatorBase &self, const Scope &scope,
const platform::CUDAPinnedPlace &place) {
self.Run(scope, place);
})
.def("type",
[](const OperatorBase &op) -> std::string { return op.Type(); })
.def("outputs",
......@@ -436,11 +478,11 @@ All parameter, weight, gradient are variables in Paddle.
m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable);
BindProgramDesc(m);
BindBlockDesc(m);
BindVarDsec(m);
BindOpDesc(m);
BindConstValue(m);
BindProgramDesc(&m);
BindBlockDesc(&m);
BindVarDsec(&m);
BindOpDesc(&m);
BindConstValue(&m);
py::class_<framework::LoDRankTable>(m, "LodRankTable")
.def("items", [](framework::LoDRankTable &table) {
......@@ -504,14 +546,14 @@ All parameter, weight, gradient are variables in Paddle.
const std::unordered_set<std::string> &params,
const ProgramDesc &startup_program,
const ProgramDesc &main_program, const std::string &loss_var_name,
Scope *scope) {
Scope *scope, bool allow_op_delay) {
new (&self) ParallelExecutor(num_threads, use_event, places,
params, startup_program, main_program,
loss_var_name, scope);
loss_var_name, scope, allow_op_delay);
})
.def("run", &ParallelExecutor::Run);
BindRecordIOWriter(m);
BindRecordIOWriter(&m);
return m.ptr();
}
} // namespace pybind
......
......@@ -13,13 +13,19 @@
// limitations under the License.
#include "paddle/fluid/pybind/recordio.h"
#include <fstream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/recordio/writer.h"
namespace paddle {
namespace pybind {
namespace {
class RecordIOWriter {
public:
RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
......@@ -49,8 +55,10 @@ class RecordIOWriter {
recordio::Writer writer_;
};
void BindRecordIOWriter(py::module& m) {
py::class_<RecordIOWriter> writer(m, "RecordIOWriter", "");
} // namespace
void BindRecordIOWriter(py::module* m) {
py::class_<RecordIOWriter> writer(*m, "RecordIOWriter", "");
py::enum_<recordio::Compressor>(writer, "Compressor", "")
.value("Snappy", recordio::Compressor::kSnappy)
.value("NoCompress", recordio::Compressor::kNoCompress);
......
......@@ -21,6 +21,7 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
extern void BindRecordIOWriter(py::module& m);
void BindRecordIOWriter(py::module* m);
} // namespace pybind
} // namespace paddle
......@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -21,12 +24,8 @@ limitations under the License. */
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
namespace details {
template <bool less, size_t I, typename... ARGS>
......@@ -34,16 +33,16 @@ struct CastToPyBufferImpl;
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<false, I, ARGS...> {
py::buffer_info operator()(framework::Tensor &tensor) {
pybind11::buffer_info operator()(const framework::Tensor &tensor) {
PADDLE_THROW("This type of tensor cannot be expose to Python");
return py::buffer_info();
return pybind11::buffer_info();
}
};
template <size_t I, typename... ARGS>
struct CastToPyBufferImpl<true, I, ARGS...> {
using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
py::buffer_info operator()(framework::Tensor &tensor) {
pybind11::buffer_info operator()(const framework::Tensor &tensor) {
if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
auto dim_vec = framework::vectorize(tensor.dims());
std::vector<size_t> dims_outside;
......@@ -82,15 +81,15 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
if (std::type_index(typeid(CUR_TYPE)) ==
std::type_index(typeid(platform::float16))) {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
return pybind11::buffer_info(
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
"e", /* np.dtype('e') == np.float16 */
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
} else {
return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
py::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()),
dims_outside, strides);
return pybind11::buffer_info(
dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
pybind11::format_descriptor<CUR_TYPE>::format(),
(size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
}
} else {
constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
......@@ -101,7 +100,7 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
} // namespace details
inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
auto buffer_info =
details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
platform::float16>()(tensor);
......@@ -109,7 +108,7 @@ inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
}
template <typename T>
T TensorGetElement(framework::Tensor &self, size_t offset) {
T TensorGetElement(const framework::Tensor &self, size_t offset) {
if (platform::is_cpu_place(self.place())) {
return self.data<T>()[offset];
} else {
......@@ -121,64 +120,70 @@ T TensorGetElement(framework::Tensor &self, size_t offset) {
// TODO(dzhwinter) : fix the redundent Tensor allocate and free
template <typename T>
void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
if (platform::is_gpu_place(self.place())) {
void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
if (platform::is_gpu_place(self->place())) {
std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
framework::TensorCopy(self, platform::CPUPlace(), dst.get());
framework::TensorCopy(*self, platform::CPUPlace(), dst.get());
dst->data<T>()[offset] = elem;
framework::TensorCopy(*dst.get(), self.place(), &self);
framework::TensorCopy(*dst.get(), self->place(), self);
} else if (platform::is_cpu_place(self.place())) {
self.data<T>()[offset] = elem;
} else if (platform::is_cpu_place(self->place())) {
self->data<T>()[offset] = elem;
}
}
template <typename T>
void PyCPUTensorSetFromArray(
framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::CPUPlace &place) {
framework::Tensor *self,
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CPUPlace place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
dims.push_back(static_cast<int>(array.shape()[i]));
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place);
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCPUTensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CPUPlace &place) {
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CPUPlace place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
dims.push_back(static_cast<int>(array.shape()[i]));
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
#ifdef PADDLE_WITH_CUDA
template <typename T>
void PyCUDATensorSetFromArray(
framework::Tensor &self,
py::array_t<T, py::array::c_style | py::array::forcecast> array,
paddle::platform::CUDAPlace &place) {
framework::Tensor *self,
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CUDAPlace place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
dims.push_back(static_cast<int>(array.shape()[i]));
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(place);
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
......@@ -188,18 +193,22 @@ void PyCUDATensorSetFromArray(
}
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDATensorSetFromArray(
framework::Tensor &self,
py::array_t<uint16_t, py::array::c_style | py::array::forcecast> array,
paddle::platform::CUDAPlace &place) {
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
array,
paddle::platform::CUDAPlace place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back((int)array.shape()[i]);
dims.push_back(static_cast<int>(array.shape()[i]));
}
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<platform::float16>(place);
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx =
......@@ -208,6 +217,43 @@ void PyCUDATensorSetFromArray(
sizeof(uint16_t) * array.size(),
cudaMemcpyHostToDevice, dev_ctx->stream());
}
template <typename T>
void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
array,
const paddle::platform::CUDAPinnedPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back(static_cast<int>(array.shape()[i]));
}
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
template <>
// This following specialization maps uint16_t in the parameter type to
// platform::float16.
void PyCUDAPinnedTensorSetFromArray(
framework::Tensor *self,
pybind11::array_t<uint16_t,
pybind11::array::c_style | pybind11::array::forcecast>
array,
const paddle::platform::CUDAPinnedPlace &place) {
std::vector<int64_t> dims;
dims.reserve(array.ndim());
for (size_t i = 0; i < array.ndim(); ++i) {
dims.push_back(static_cast<int>(array.shape()[i]));
}
self->Resize(framework::make_ddim(dims));
auto *dst = self->mutable_data<platform::float16>(place);
std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
}
#endif
} // namespace pybind
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/pybind/tensor_py.h"
#include <iostream>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor.h"
TEST(TensorPy, CastToPyBufferImpl) {
typedef int ElemType;
paddle::framework::Tensor t;
auto d = paddle::framework::make_ddim({1, 2, 3});
int* p = t.mutable_data<ElemType>(d, paddle::platform::CPUPlace());
for (int i = 0; i < paddle::framework::product(d); ++i) {
p[i] = i;
}
pybind11::buffer_info bi = paddle::pybind::CastToPyBuffer(t);
EXPECT_EQ(bi.itemsize, static_cast<size_t>(sizeof(ElemType)));
EXPECT_EQ(bi.size, static_cast<size_t>(paddle::framework::product(d)));
EXPECT_EQ(bi.ndim, static_cast<size_t>(3)); // 3-dimensional as d.
EXPECT_EQ(bi.shape.size(), 3U); // as Dim d.
EXPECT_EQ(bi.shape[0], static_cast<size_t>(1));
EXPECT_EQ(bi.shape[1], static_cast<size_t>(2));
EXPECT_EQ(bi.shape[2], static_cast<size_t>(3));
EXPECT_EQ(bi.strides.size(), 3U); // 3-dimensional as d.
EXPECT_EQ(bi.strides[2], static_cast<size_t>(sizeof(ElemType)));
EXPECT_EQ(bi.strides[1], static_cast<size_t>(sizeof(ElemType) * 3));
EXPECT_EQ(bi.strides[0], static_cast<size_t>(sizeof(ElemType) * 2 * 3));
}
......@@ -14,11 +14,13 @@
#include "paddle/fluid/recordio/chunk.h"
#include <algorithm>
#include <memory>
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
#include "snappystream.hpp"
#include "zlib.h"
#include "snappy_stream/include/snappystream.hpp"
#include "zlib/include/zlib.h"
namespace paddle {
namespace recordio {
......@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
* Copy stream in to another stream
*/
static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf(
in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); });
ReadStreamByBuf(in, 0,
[&os](const char* buf, size_t len) { os.write(buf, len); });
}
/**
......@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) {
static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = static_cast<uint32_t>(crc32(
crc, reinterpret_cast<const Bytef*>(buf), static_cast<uInt>(len)));
crc = static_cast<uint32_t>(crc32(crc, reinterpret_cast<const Bytef*>(buf),
static_cast<uInt>(len)));
});
return crc;
}
......
......@@ -24,7 +24,7 @@ namespace recordio {
// A Chunk contains the Header and optionally compressed records.
class Chunk {
public:
public:
Chunk() : num_bytes_(0) {}
void Add(const std::string& buf) {
num_bytes_ += buf.size();
......@@ -46,7 +46,7 @@ public:
bool Empty() const { return records_.empty(); }
private:
private:
std::vector<std::string> records_;
// sum of record lengths in bytes.
size_t num_bytes_;
......
......@@ -18,29 +18,27 @@
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Chunk, SaveLoad) {
Chunk ch;
paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress);
ch.Write(ss, paddle::recordio::Compressor::kNoCompress);
ss.seekg(0);
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U);
}
TEST(Chunk, Compressor) {
Chunk ch;
paddle::recordio::Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kSnappy);
ch.Write(ss, paddle::recordio::Compressor::kSnappy);
std::stringstream ss2;
ch.Write(ss2, Compressor::kNoCompress);
ch.Write(ss2, paddle::recordio::Compressor::kNoCompress);
ASSERT_LE(ss.tellp(), ss2.tellp()); // Compress should contain less data;
ch.Clear();
......
......@@ -37,7 +37,7 @@ enum class Compressor : uint32_t {
// Header is the metadata of Chunk
class Header {
public:
public:
Header();
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
......@@ -51,7 +51,7 @@ public:
Compressor CompressType() const { return compressor_; }
uint32_t CompressSize() const { return compress_size_; }
private:
private:
uint32_t num_records_;
uint32_t checksum_;
Compressor compressor_;
......
......@@ -18,14 +18,12 @@
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3);
paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3);
std::stringstream ss;
hdr.Write(ss);
ss.seekg(0, std::ios::beg);
Header hdr2;
paddle::recordio::Header hdr2;
hdr2.Parse(ss);
EXPECT_TRUE(hdr == hdr2);
}
......@@ -13,10 +13,14 @@
// limitations under the License.
#include "paddle/fluid/recordio/scanner.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace recordio {
Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
: stream_(std::move(stream)) {
Reset();
......
......@@ -16,12 +16,15 @@
#include <fstream>
#include <memory>
#include <string>
#include "paddle/fluid/recordio/chunk.h"
namespace paddle {
namespace recordio {
class Scanner {
public:
public:
explicit Scanner(std::unique_ptr<std::istream>&& stream);
explicit Scanner(const std::string& filename);
......@@ -32,7 +35,7 @@ public:
bool HasNext() const;
private:
private:
std::unique_ptr<std::istream> stream_;
Chunk cur_chunk_;
size_t offset_;
......
......@@ -12,9 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/writer.h"
#include <string>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace recordio {
void Writer::Write(const std::string& record) {
cur_chunk_.Add(record);
if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
......
......@@ -11,16 +11,17 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/recordio/chunk.h"
namespace paddle {
namespace recordio {
class Writer {
public:
Writer(std::ostream* sout,
Compressor compressor,
public:
Writer(std::ostream* sout, Compressor compressor,
size_t max_num_records_in_chunk = 1000)
: stream_(*sout),
max_num_records_in_chunk_(max_num_records_in_chunk),
......@@ -32,7 +33,7 @@ public:
~Writer();
private:
private:
std::ostream& stream_;
size_t max_num_records_in_chunk_;
Chunk cur_chunk_;
......
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <sstream>
#include <string>
#include "gtest/gtest.h"
#include "paddle/fluid/recordio/scanner.h"
#include "paddle/fluid/recordio/writer.h"
......
../framework/.clang-format
\ No newline at end of file
......@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "piece.h"
#include "paddle/fluid/string/piece.h"
#include <string.h>
......
......@@ -71,6 +71,8 @@
#include <iostream>
#include <sstream>
#include <string>
#include "tinyformat/tinyformat.h" // https://github.com/c42f/tinyformat
namespace paddle {
......
......@@ -11,7 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "printf.h"
#include "paddle/fluid/string/printf.h"
#include <string>
......@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) {
std::string weekday = "Wednesday";
const char* month = "July";
size_t day = 27;
long hour = 14;
int hour = 14;
int min = 44;
EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "to_string.h"
#include "paddle/fluid/string/to_string.h"
#include <gtest/gtest.h>
constexpr char kOutputString[] = "User Defined Output";
......@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
}
TEST(to_string, normal) {
using namespace paddle::string;
using paddle::string::to_string;
ASSERT_EQ("10", to_string(10));
ASSERT_EQ("abc", to_string("abc"));
ASSERT_EQ("1.2", to_string(1.2));
}
TEST(to_string, user_defined) {
using namespace paddle::string;
UserDefinedClass instance;
ASSERT_EQ(kOutputString, to_string(instance));
ASSERT_EQ(kOutputString, paddle::string::to_string(instance));
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "UpsampleLayer.h"
#include "iostream"
namespace paddle {
REGISTER_LAYER(upsample, UpsampleLayer);
size_t UpsampleLayer::getOutputSize() {
if (upsampleSize_ == 0) {
upsampleSize_ = imgSize_ * scale_ - static_cast<int>(padOutX_);
upsampleSizeY_ = imgSizeY_ * scaleY_ - static_cast<int>(padOutY_);
}
return upsampleSize_ * upsampleSizeY_ * channels_;
}
bool UpsampleLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(config_.inputs_size(), 2);
const auto& conf = config_.inputs(0).upsample_conf();
const auto& img_conf = conf.image_conf();
imgSizeY_ =
img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
imgSize_ = img_conf.img_size();
channels_ = img_conf.channels();
CHECK((conf.has_upsample_size()) || (conf.has_scale()))
<< "scale or upsample_size is required.";
if (conf.has_upsample_size()) {
upsampleSize_ = conf.upsample_size();
upsampleSizeY_ = upsampleSize_;
if (conf.has_upsample_size_y()) {
upsampleSizeY_ = conf.upsample_size_y();
}
} else {
if (!conf.has_scale_y()) {
scale_ = scaleY_ = conf.scale_y();
CHECK_GT(static_cast<int>(scale_), 1);
} else {
scale_ = conf.scale();
scaleY_ = conf.scale_y();
}
padOutX_ = conf.pad_out_x();
padOutY_ = conf.pad_out_y();
CHECK(!padOutX_ || scale_ == 2)
<< "Output height padding compensation requires scale_ == 2";
CHECK(!padOutY_ || scaleY_ == 2)
<< "Output width padding compensation requires scaleY_ == 2";
upsampleSize_ = upsampleSizeY_ = 0;
}
return true;
}
void UpsampleLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr input = getInputValue(0);
MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
size_t batchSize = input->getHeight();
size_t outSize = getOutputSize();
CHECK_EQ(input->getWidth(), mask->getWidth());
CHECK_EQ(mask->getHeight(), batchSize);
resetOutput(batchSize, outSize);
MatrixPtr output = getOutputValue();
output->upsampleForward(*input,
*mask,
imgSize_,
imgSizeY_,
channels_,
upsampleSize_,
upsampleSizeY_);
}
void UpsampleLayer::backward(const UpdateCallback& callback) {
MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
MatrixPtr inputGrad = getInputGrad(0);
MatrixPtr outputGrad = getOutputGrad();
inputGrad->upsampleBackward(*outputGrad,
*mask,
imgSize_,
imgSizeY_,
channels_,
upsampleSize_,
upsampleSizeY_);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* This layer transpose the pooling process.
* It takes two input, the first input is the input data, and
* the second is the mask data from the max-pool-with-mask layer.
*
*/
class UpsampleLayer : public Layer {
public:
explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
~UpsampleLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override;
size_t getOutputSize();
protected:
size_t scale_, scaleY_;
size_t upsampleSize_, upsampleSizeY_;
size_t padOutX_, padOutY_;
size_t imgSize_, imgSizeY_;
size_t channels_;
};
} // namespace paddle
......@@ -14,6 +14,11 @@ function(gserver_test TARGET)
COMMAND ${TARGET})
endfunction()
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
gserver_test(test_LayerGrad)
gserver_test(test_CRFLayerGrad)
gserver_test(test_CrossEntropyOverBeamGrad)
......@@ -27,15 +32,16 @@ gserver_test(test_BatchNorm)
gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
gserver_test(test_Upsample)
set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/gserver/tests)
function(gserver_test_with_python TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction()
gserver_test_with_python(test_PyDataProvider2)
......@@ -56,7 +62,7 @@ if(WITH_MKLDNN)
LayerGradUtil.cpp)
add_test(NAME test_MKLDNN
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif()
############### test_WarpCTCLayer #######################
......@@ -65,7 +71,7 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
test_WarpCTCLayer.cpp)
add_test(NAME test_WarpCTCLayer
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
endif()
if(NOT MOBILE_INFERENCE)
......@@ -83,15 +89,15 @@ if(NOT MOBILE_INFERENCE)
endif()
add_test(NAME test_NetworkCompare
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
############ test_CompareSparse ################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif()
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h"
void setPoolConfig(paddle::TestConfig* config,
paddle::PoolConfig* pool,
const string& poolType) {
(*config).biasSize = 0;
(*config).layerConfig.set_type("pool");
(*config).layerConfig.set_num_filters(1);
int kw = 2, kh = 2;
int pw = 0, ph = 0;
int sw = 2, sh = 2;
pool->set_pool_type(poolType);
pool->set_channels(2);
pool->set_size_x(kw);
pool->set_size_y(kh);
pool->set_start(0);
pool->set_padding(pw);
pool->set_padding_y(ph);
pool->set_stride(sw);
pool->set_stride_y(sh);
int ow =
paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh =
paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow);
pool->set_output_y(oh);
}
paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
const string& poolType,
bool use_gpu,
real* tempGradData) {
/* prepare maxPoolWithMaskLayer */
paddle::TestConfig config;
config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
paddle::PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(8);
pool->set_img_size_y(8);
setPoolConfig(&config, pool, "max-pool-with-mask");
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
config.layerConfig.set_name("MaxPoolWithMask");
std::vector<paddle::DataLayerPtr> dataLayers;
paddle::LayerMap layerMap;
vector<paddle::Argument> datas;
initDataLayer(config,
&dataLayers,
&datas,
&layerMap,
"MaxPoolWithMask",
1,
false,
use_gpu);
dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu;
std::vector<paddle::ParameterPtr> parameters;
paddle::LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
/* prepare the upsample layer */
paddle::LayerConfig upsampleLayerConfig;
upsampleLayerConfig.set_type("upsample");
paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
upsampleLayerConfig.add_inputs();
paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
upsampleConfig->set_scale(2);
paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
imageConfig->set_channels(2);
imageConfig->set_img_size(4);
imageConfig->set_img_size_y(4);
upsampleLayerConfig.set_size(2 * 8 * 8);
upsampleLayerConfig.set_name("upsample");
for (size_t i = 0; i < 2; i++) {
paddle::LayerInputConfig& inputTemp =
*(upsampleLayerConfig.mutable_inputs(i));
inputTemp.set_input_layer_name("MaxPoolWithMask");
}
paddle::LayerPtr upsampleLayer;
paddle::ParameterMap parameterMap;
upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
layerMap[upsampleLayerConfig.name()] = upsampleLayer;
upsampleLayer->init(layerMap, parameterMap);
upsampleLayer->setNeedGradient(true);
upsampleLayer->forward(paddle::PASS_GC);
upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
upsampleLayer->backward();
return upsampleLayer;
}
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false;
paddle::MatrixPtr inputMat;
paddle::MatrixPtr inputGPUMat;
paddle::MatrixPtr tempGradMat;
inputMat = paddle::Matrix::create(1, 128, false, useGpu);
inputMat->randomizeUniform();
tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
tempGradMat->randomizeUniform();
real* tempGradData = tempGradMat->getData();
paddle::LayerPtr upsampleLayerCPU =
doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
#ifdef PADDLE_WITH_CUDA
useGpu = true;
real* data = inputMat->getData();
inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
inputGPUMat->copyFrom(data, 128);
paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
upsampleLayerGPU->getOutput("").value);
paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
upsampleLayerGPU->getPrev(0)->getOutputGrad());
#endif
}
......@@ -1024,6 +1024,66 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
LOG(INFO) << "the diffCnt is " << diffCnt;
}
void GpuMatrix::upsampleForward(Matrix& input,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
CHECK(input.useGpu_ == true) << "Matrix type are not equal";
CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
real* inputData = input.getData();
real* maskData = mask.getData();
real* outData = data_;
size_t batch = input.getHeight();
CHECK(imgSizeH * imgSizeW * channels == input.getWidth());
CHECK(imgSizeH * imgSizeW * channels == mask.getWidth());
CHECK_EQ(batch, this->getHeight());
CHECK(width_ == outputH * outputW * channels);
hl_upsample_forward(inputData,
maskData,
batch,
imgSizeH,
imgSizeW,
channels,
outputH,
outputW,
outData);
}
void GpuMatrix::upsampleBackward(Matrix& outputGrad,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal";
CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
real* outputGradData = outputGrad.getData();
real* maskData = mask.getData();
real* inputGradData = data_;
size_t batch = outputGrad.getHeight();
CHECK(imgSizeH * imgSizeW == this->getWidth() / channels);
CHECK_EQ(batch, this->getHeight());
CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth());
hl_upsample_backward(outputGradData,
maskData,
batch,
imgSizeH,
imgSizeW,
channels,
outputH,
outputW,
inputGradData);
}
void GpuMatrix::maxPoolForward(Matrix& inputMat,
size_t imgSizeH,
size_t imgSizeW,
......@@ -1986,6 +2046,72 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
CHECK_EQ(info, 0);
}
void CpuMatrix::upsampleForward(Matrix& input,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
real* inputData = input.getData();
real* maskData = mask.getData();
real* outData = data_;
size_t inLength = imgSizeH * imgSizeW;
size_t outLength = outputH * outputW;
size_t batch = input.getHeight();
CHECK(inLength == input.getWidth() / channels);
CHECK_EQ(batch, this->getHeight());
CHECK_EQ(channels * outLength, this->getWidth());
for (size_t k = 0; k < batch; k++) {
for (size_t c = 0; c < channels; c++) {
for (size_t i = 0; i < inLength; i++) {
size_t out_index = static_cast<int>(maskData[i]);
if (out_index >= outLength) {
LOG(FATAL) << "upsample index " << out_index << " out of range.";
}
outData[out_index] = inputData[i];
}
inputData += inLength;
maskData += inLength;
outData += outLength;
}
}
}
void CpuMatrix::upsampleBackward(Matrix& outputGrad,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
real* outputGradData = outputGrad.getData();
real* maskData = mask.getData();
real* inputGradData = data_;
size_t inLength = imgSizeH * imgSizeW;
size_t outLength = outputH * outputW;
size_t batch = outputGrad.getHeight();
CHECK(inLength == this->getWidth() / channels);
CHECK_EQ(batch, this->getHeight());
CHECK_EQ(channels * outLength, outputGrad.getWidth());
for (size_t k = 0; k < batch; k++) {
for (size_t c = 0; c < channels; c++) {
for (size_t i = 0; i < inLength; i++) {
size_t out_index = static_cast<int>(maskData[i]);
if (out_index >= outLength) {
LOG(FATAL) << "upsample index " << out_index << " out of range.";
}
inputGradData[i] = outputGradData[out_index];
}
inputGradData += inLength;
maskData += inLength;
outputGradData += outLength;
}
}
}
void CpuMatrix::maxPoolForward(Matrix& inputMat,
size_t imgSizeH,
size_t imgSizeW,
......
......@@ -859,6 +859,26 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void upsampleForward(Matrix& input,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
LOG(FATAL) << "Not implemeted";
}
virtual void upsampleBackward(Matrix& outputGrad,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW) {
LOG(FATAL) << "Not implemeted";
}
/**
* Pooling forward operation, pick out the largest element
* in the sizeX of value, if the maskMatP is not NULL, it will
......@@ -1420,6 +1440,22 @@ public:
void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
void upsampleForward(Matrix& input,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW);
void upsampleBackward(Matrix& outputGrad,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW);
void maxPoolForward(Matrix& inputMat,
size_t imgSizeH,
size_t imgSizeW,
......@@ -1694,6 +1730,22 @@ public:
MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
void upsampleForward(Matrix& input,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW);
void upsampleBackward(Matrix& outputGrad,
Matrix& mask,
size_t imgSizeH,
size_t imgSizeW,
size_t channels,
size_t outputH,
size_t outputW);
void maxPoolForward(Matrix& inputMat,
size_t imgSizeH,
size_t imgSizeW,
......
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
)
add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
set(PYTHON_PATH
${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/trainer/tests)
function(trainer_test TARGET)
add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
add_test(NAME ${TARGET}
COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endfunction()
trainer_test(test_Compare)
......@@ -22,11 +27,11 @@ if(WITH_PYTHON)
add_test(NAME test_TrainerOnePass
COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
endif()
#################### test_config_parser #########################
add_test(NAME test_config_parser
COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE}
${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
......@@ -2,8 +2,8 @@
file(GLOB UTIL_HEADERS . *.h)
file(GLOB UTIL_SOURCES . *.cpp)
create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
set(UTIL_RES ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
if(APPLE)
file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
......
......@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames})
get_filename_component(ABS_FIL ${filename} ABSOLUTE)
get_filename_component(FIL_WE ${filename} NAME_WE)
set(CUR_PROTO_GEN_PY
${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
set(PROTO_GEN_PY
${CUR_PROTO_GEN_PY}
${PROTO_GEN_PY})
add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto"
ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
"-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc)
endforeach()
......
......@@ -323,6 +323,16 @@ message ClipConfig {
required double max = 2;
}
message UpsampleConfig {
required ImageConfig image_conf = 1;
optional uint32 scale = 2 [ default = 2 ];
optional uint32 scale_y = 3 [ default = 2 ];
optional bool pad_out_x = 4 [ default = false ];
optional bool pad_out_y = 5 [ default = false ];
optional uint32 upsample_size = 6;
optional uint32 upsample_size_y = 7;
}
message ROIPoolConfig {
required uint32 pooled_width = 1;
required uint32 pooled_height = 2;
......@@ -359,6 +369,7 @@ message LayerInputConfig {
optional ClipConfig clip_conf = 18;
optional ScaleSubRegionConfig scale_sub_region_conf = 19;
optional ROIPoolConfig roi_pool_conf = 20;
optional UpsampleConfig upsample_conf = 21;
}
message LayerConfig {
......
......@@ -47,14 +47,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so
DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle
COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
......
......@@ -31,7 +31,7 @@ import regularizer
import average
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace
from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv,
......@@ -57,6 +57,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'LoDTensor',
'CPUPlace',
'CUDAPlace',
'CUDAPinnedPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
......
......@@ -17,7 +17,7 @@ import framework
from framework import Program, default_main_program, default_startup_program, Parameter, Variable
import optimizer
from layer_helper import LayerHelper
from distributed_spliter import *
import distributed_splitter as splitter
import math
from . import core
import debuger
......@@ -138,7 +138,7 @@ class DistributeTranspiler:
program=None,
pservers="127.0.0.1:6174",
trainers=1,
split_method=round_robin):
split_method=splitter.round_robin):
"""
Transpile the program to distributed data-parallelism programs.
The main_program will be transformed to use a remote parameter server
......@@ -408,11 +408,7 @@ class DistributeTranspiler:
pserver_vars = pserver_program.global_block().vars
created_var_map = dict()
for _, var in pserver_vars.iteritems():
tmpvar = s_prog.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
tmpvar = s_prog.global_block().clone_variable(var)
created_var_map[var.name] = tmpvar
# 2. rename op outputs
......@@ -708,11 +704,7 @@ class DistributeTranspiler:
varlist = [varlist]
for var in varlist:
program.global_block().create_var(
name=var.name,
persistable=var.persistable,
dtype=var.dtype,
shape=var.shape)
program.global_block().clone_variable(var)
optimize_block.append_op(
type=opt_op.type,
......
......@@ -17,8 +17,10 @@ def hash_name(varlist, pserver_endpoints):
"""
hash variable names to several endpoints.
:param varlist: a list of Variables
:return: a map of pserver endpoint -> varname
Args:
varlist(list): a list of Variables
Returns(dict): a map of pserver endpoint -> varname
"""
def _hash_block(block_str, total):
......@@ -34,9 +36,14 @@ def hash_name(varlist, pserver_endpoints):
def round_robin(varlist, pserver_endpoints):
"""
distribute variables to several endpoints.
Distribute variables to several endpoints.
Args:
varlist(list): a list of variables
pserver_endpoints(list): a list of pserver endpoints
Returns(list[int]): the endpoint for each variable
"""
assert (len(varlist) > len(pserver_endpoints))
assert (len(varlist) >= len(pserver_endpoints))
eplist = []
pserver_idx = 0
......
......@@ -852,7 +852,7 @@ class Block(object):
def sync_with_cpp(self):
"""
Sync with the desc on the c++ end.
Sync from the desc on the c++ end.
This method is used to synchronize the c++ desc instance generated by backward.
"""
......@@ -861,6 +861,11 @@ class Block(object):
if not self.has_var(var.name()):
self.create_var(name=var.name(), desc=var, type=var.type())
# sync variables removed from c++ end
for var in self.vars.keys():
if not self.desc.find_var(var):
self.vars.pop(var)
# sync operators from cpp
ops_in_cpp = []
for op_idx in range(0, self.desc.op_size()):
......@@ -895,6 +900,19 @@ class Block(object):
op = Operator(self, op_desc)
self.ops.append(op)
# sync ops removed from c++ end
if end_index != -1 and end_index < len(self.ops):
ops_in_cpp_index = 0
ops_in_python_index = 0
while ops_in_python_index < len(
self.ops) and ops_in_cpp_index < len(ops_in_cpp):
if self.ops[ops_in_python_index].desc != ops_in_cpp[
ops_in_cpp_index]:
del self.ops[ops_in_python_index]
else:
ops_in_cpp_index += 1
ops_in_python_index += 1
assert len(self.ops) == len(ops_in_cpp)
for index in range(len(self.ops)):
assert self.ops[index].desc == ops_in_cpp[index]
......@@ -942,13 +960,20 @@ class Block(object):
The new variable cloned from 'var' in current block.
"""
assert isinstance(var, Variable)
return self.create_var(
ret_var = None
# make STEP_SCOPES var can be safely cloned.
if var.type == core.VarDesc.VarType.STEP_SCOPES:
ret_var = self.create_var(
name=var.name, persistable=var.persistable, type=var.type)
else:
ret_var = self.create_var(
name=var.name,
shape=var.shape,
dtype=var.dtype,
type=var.type,
lod_level=var.lod_level,
persistable=True)
return ret_var
class Program(object):
......
......@@ -76,6 +76,7 @@ __all__ = [
'reshape',
'lod_reset',
'lrn',
'pad',
]
......@@ -132,6 +133,8 @@ def fc(input,
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
act (str, default None): Activation to be applied to the output of this layer.
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
library is installed. Default: False
name (str, default None): The name of this layer.
Returns:
......@@ -152,6 +155,31 @@ def fc(input,
dtype = helper.input_dtype()
mul_results = []
if use_mkldnn:
tmp = helper.create_tmp_variable(dtype)
input_shape = input.shape
param_shape = [
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size]
w = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
is_bias=False)
if bias_attr is None or bias_attr is False:
bias_attr = False
else:
bias_attr = True
helper.append_op(
type="fc",
inputs={"Input": input,
"W": w},
outputs={"Out": tmp},
attrs={"use_mkldnn": use_mkldnn,
"bias_attr": bias_attr})
return helper.append_activation(tmp)
else:
for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape
param_shape = [
......@@ -169,19 +197,20 @@ def fc(input,
attrs={
"x_num_col_dims": num_flatten_dims,
"y_num_col_dims": 1,
'use_mkldnn': use_mkldnn
})
mul_results.append(tmp)
# sum
if len(mul_results) == 1:
pre_bias = mul_results[0]
else:
pre_bias = helper.create_tmp_variable(dtype)
helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
type="sum",
inputs={"X": mul_results},
outputs={"Out": pre_bias})
# add bias
pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
pre_activation = helper.append_bias_op(
pre_bias, dim_start=num_flatten_dims)
# add activation
return helper.append_activation(pre_activation)
......@@ -3379,6 +3408,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
Examples:
.. code-block:: python
data = fluid.layers.data(
name='data', shape=[2, 4, 6], dtype='float32')
reshaped = fluid.layers.reshape(
......@@ -3580,3 +3610,62 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"beta": beta})
return lrn_out
def pad(x, paddings, pad_value=0., name=None):
"""
Pads a tensor with a constant value given by :attr:`pad_value`, and the
padded width is specified by :attr:`paddings`.
Specifically, the number of values padded before the contents of :attr:`x`
in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
of values padded after the contents of :attr:`x` in dimension :attr:`i` is
indicated by :attr:`paddings[i+1]`.
See below for an example.
.. code-block:: text
Given:
x = [[1, 2], [3, 4]]
paddings = [0, 1, 1, 2]
pad_value = 0
Return:
out = [[0, 1, 2, 0, 0]
[0, 3, 4, 0, 0]
[0, 0, 0, 0, 0]]
Args:
x (Variable): The input tensor variable.
paddings (list): A list of integers. Its elements specify the padded
width before and after for each dimension in turn.
The length of :attr:paddings must be
:math:`rank(x) \\times 2`.
pad_value (float): The constant value used to pad.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The padded tensor variable.
Examples:
.. code-block:: python
# x is a rank 2 tensor variable.
out = fluid.layers.pad(
x=x, paddings=[0, 1, 1, 2], pad_value=0.)
"""
helper = LayerHelper('pad', input=x, **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='pad',
inputs={'X': x},
outputs={'Out': out},
attrs={'paddings': paddings,
'pad_value': float(pad_value)})
return out
......@@ -21,21 +21,34 @@ __all__ = ['ParallelExecutor']
class ParallelExecutor(object):
def __init__(self, loss_name, use_cuda, num_threads=None):
places = []
def __init__(self,
loss_name,
use_cuda,
num_threads=None,
allow_op_delay=False):
self._places = []
self._act_places = []
if use_cuda:
for i in xrange(core.get_cuda_device_count()):
p = core.Place()
p.set_place(core.CUDAPlace(i))
places.append(p)
self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1])
self._places.append(p)
else:
for i in xrange(multiprocessing.cpu_count()):
p = core.Place()
p.set_place(core.CPUPlace())
places.append(p)
self._act_places.append(core.CPUPlace(i))
p.set_place(self._act_places[-1])
self._places.append(p)
assert self._places, "no place for execution"
if num_threads is None:
num_threads = min(len(places) * 2, multiprocessing.cpu_count())
if use_cuda:
# Experiments on se-resnext shows that too many threads hurt
# performance. Worth tunning for other models in the future.
num_threads = len(self._places)
else:
min(len(self._places) * 2, multiprocessing.cpu_count())
startup = framework.default_startup_program()
main = framework.default_main_program()
......@@ -44,7 +57,7 @@ class ParallelExecutor(object):
self.executor = core.ParallelExecutor(
num_threads,
True if use_cuda else False, # use_event
places,
self._places,
set([
p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient
......@@ -52,11 +65,29 @@ class ParallelExecutor(object):
startup.desc,
main.desc,
loss_name,
scope)
scope,
allow_op_delay)
self.scope = scope
def run(self, fetch_list):
def run(self, fetch_list, feed_dict={}):
"""
:param fetch_list: A list of variable names that will be fetched.
:param feed_dict: A dict mapping for feed variable name to LoDTensor
or numpy array.
:return: fetched value list.
"""
if not isinstance(feed_dict, dict):
raise TypeError("feed_dict should be a dict")
feed_tensor_dict = {}
for i, feed_name in enumerate(feed_dict):
feed_tensor = feed_dict[feed_name]
if not isinstance(feed_tensor, core.LoDTensor):
feed_tensor = core.LoDTensor()
feed_tensor.set(feed_dict[feed_name], self._act_places[0])
feed_tensor_dict[feed_name] = feed_tensor
fetch_var_name = '@FETCHED_VAR_NAME@'
self.executor.run(fetch_list, fetch_var_name)
self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
return [arr[i] for i in range(len(arr))]
......@@ -22,13 +22,14 @@ function(py_test_modules TARGET_NAME)
set(multiValueArgs MODULES DEPS ARGS ENVS)
cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_modules_ENVS}
COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif()
endfunction()
# test time consuming OPs in a separate process for expliot parallism
list(REMOVE_ITEM TEST_OPS test_parallel_executor)
list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
list(REMOVE_ITEM TEST_OPS test_mul_op)
......@@ -64,6 +65,7 @@ else()
endif(WITH_FAST_BUNDLE_TEST)
# tests with high overhead
py_test_modules(test_parallel_executor MODULES test_parallel_executor)
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
py_test_modules(test_mul_op MODULES test_mul_op)
......
......@@ -97,8 +97,11 @@ class TestConv2dOp(OpTest):
}
self.outputs = {'Output': output}
def testcudnn(self):
return core.is_compiled_with_cuda() and self.use_cudnn
def test_check_output(self):
if self.use_cudnn:
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
else:
......@@ -107,7 +110,7 @@ class TestConv2dOp(OpTest):
def test_check_grad(self):
if self.dtype == np.float16:
return
if self.use_cudnn:
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
......@@ -121,7 +124,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_filter(self):
if self.dtype == np.float16:
return
if self.use_cudnn:
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['Input'],
......@@ -138,7 +141,7 @@ class TestConv2dOp(OpTest):
def test_check_grad_no_input(self):
if self.dtype == np.float16:
return
if self.use_cudnn:
if self.testcudnn():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place, ['Filter'],
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
def fully_connected_naive(input, weights, bias_data=None):
in_n, in_c, in_h, in_w = input.shape
w_h, w_c = weights.shape
x_data = np.reshape(input, [in_n, in_c * in_h * in_w])
w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w)))
result = None
if not bias_data:
result = np.dot(x_data, w_data)
else:
result = np.dot(x_data, w_data) + bias_data
return result
class MatrixGenerate:
def __init__(self, mb, ic, oc, h, w):
self.input = np.random.random((mb, ic, h, w)).astype("float32")
self.weights = np.random.random((ic * h * w, oc)).astype("float32")
class TestFCMKLDNNOp(OpTest):
def setUp(self):
self.op_type = "fc"
self.use_mkldnn = True
self.with_bias = True
self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
self.attrs = {
'use_mkldnn': self.use_mkldnn,
'with_bias': self.with_bias
}
self.outputs = {
'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
}
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(set(['Input', 'W']), 'Out', max_relative_error=0.9)
def test_check_grad_no_weight(self):
self.check_grad(
['Input'], 'Out', max_relative_error=0.5, no_grad_set=set('W'))
class TestFCMKLDNNOp1(TestFCMKLDNNOp):
def init_op_type(self):
self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
class TestFCMKLDNNOp2(TestFCMKLDNNOp):
def init_op_type(self):
self.matrix = MatrixGenerate(2, 32, 40, 1, 1)
class TestFCMKLDNNOp3(TestFCMKLDNNOp):
def init_op_type(self):
self.matrix = MatrixGenerate(2, 2, 4, 1, 1)
class TestFCMKLDNNOp4(TestFCMKLDNNOp):
def init_op_type(self):
self.with_bias = False
self.matrix = MatrixGenerate(2, 32, 48, 2, 2)
class TestFCMKLDNNOp4(TestFCMKLDNNOp):
def init_op_type(self):
self.with_bias = False
self.matrix = MatrixGenerate(2, 32, 1000, 6, 6)
if __name__ == "__main__":
unittest.main()
......@@ -115,18 +115,18 @@ class TestLookupTableWIsSelectedRows(OpTest):
w_array = np.ones((len(rows), row_numel)).astype("float32")
for i in range(len(rows)):
w_array[i] *= i
ids_tensor = w_selected_rows.get_tensor()
ids_tensor.set(w_array, place)
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
# create Out Variable
Out_tensor = scope.var('Out').get_tensor()
out_tensor = scope.var('Out').get_tensor()
# create and run lookup_table operator
lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
lookup_table.run(scope, place)
# get result from Out
result_array = np.array(Out_tensor)
result_array = np.array(out_tensor)
# all(): return True if all elements of the iterable are true (or if the iterable is empty)
for idx, row in enumerate(ids_array):
assert (row[0] == result_array[idx]).all()
......
......@@ -21,7 +21,11 @@ import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
def simple_fc_net():
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
......@@ -42,13 +46,18 @@ def simple_fc_net():
return loss
def fc_with_batchnorm():
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_recordio_file(
filename='./mnist.recordio',
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(1):
hidden = fluid.layers.fc(
......@@ -135,24 +144,26 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
def SE_ResNeXt152(batch_size=4):
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
assert not use_feed, "SE_ResNeXt doesn't support feed yet"
img = fluid.layers.fill_constant(
shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
label = fluid.layers.fill_constant(
shape=[batch_size, 1], dtype='int64', value=0.0)
conv = conv_bn_layer(
input=img, num_filters=64, filter_size=3, stride=2, act='relu')
input=img, num_filters=16, filter_size=3, stride=2, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = conv_bn_layer(
input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
cardinality = 64
cardinality = 32
reduction_ratio = 16
depth = [3, 8, 36, 3]
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
for block in range(len(depth)):
......@@ -184,11 +195,13 @@ class TestParallelExecutorBase(unittest.TestCase):
method,
memory_opt=True,
iter=10,
batch_size=None):
batch_size=None,
allow_op_delay=False,
feed_dict={}):
main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
loss = method()
loss = method(use_feed=len(feed_dict) > 0)
adam = fluid.optimizer.Adam()
adam.minimize(loss)
if memory_opt:
......@@ -198,13 +211,13 @@ class TestParallelExecutorBase(unittest.TestCase):
if batch_size is not None:
batch_size *= fluid.core.get_cuda_device_count()
begin = time.time()
first_loss, = exe.run([loss.name])
first_loss, = exe.run([loss.name], feed_dict=feed_dict)
first_loss = numpy.array(first_loss)
for i in xrange(iter):
exe.run([])
exe.run([], feed_dict=feed_dict)
last_loss, = exe.run([loss.name])
last_loss, = exe.run([loss.name], feed_dict=feed_dict)
end = time.time()
if batch_size is not None:
......@@ -222,7 +235,7 @@ class TestMNIST(TestParallelExecutorBase):
def setUpClass(cls):
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=32)
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
......@@ -236,9 +249,21 @@ class TestMNIST(TestParallelExecutorBase):
def test_simple_fc(self):
self.check_network_convergence(simple_fc_net)
self.check_network_convergence(simple_fc_net, allow_op_delay=True)
img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
simple_fc_net, feed_dict={"image": img,
"label": label})
def test_batchnorm_fc(self):
self.check_network_convergence(fc_with_batchnorm)
img = numpy.zeros(shape=[32, 784], dtype='float32')
label = numpy.ones(shape=[32, 1], dtype='int64')
self.check_network_convergence(
fc_with_batchnorm, feed_dict={"image": img,
"label": label})
class TestResnet(TestParallelExecutorBase):
......@@ -262,10 +287,10 @@ class TestResnet(TestParallelExecutorBase):
def test_resnet(self):
import functools
batch_size = 4
batch_size = 2
self.check_network_convergence(
functools.partial(
SE_ResNeXt152, batch_size=batch_size),
SE_ResNeXt50Small, batch_size=batch_size),
iter=20,
batch_size=batch_size)
......@@ -394,7 +419,8 @@ def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
import transformer_model
def transformer():
def transformer(use_feed):
assert not use_feed, "transfomer doesn't support feed yet"
return transformer_model.transformer(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
......
......@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest):
self.attrs = {
'min_sizes': self.min_sizes,
'max_sizes': self.max_sizes,
'aspect_ratios': self.aspect_ratios,
'variances': self.variances,
'flip': self.flip,
......@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest):
'step_h': self.step_h,
'offset': self.offset
}
if len(self.max_sizes) > 0:
self.attrs['max_sizes'] = self.max_sizes
self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
return
def setUp(self):
self.op_type = "prior_box"
self.set_data()
def set_max_sizes(self):
max_sizes = [5, 10]
self.max_sizes = np.array(max_sizes).astype('float32').tolist()
def init_test_params(self):
self.layer_w = 4
self.layer_h = 4
self.layer_w = 32
self.layer_h = 32
self.image_w = 20
self.image_h = 20
self.image_w = 40
self.image_h = 40
self.step_w = float(self.image_w) / float(self.layer_w)
self.step_h = float(self.image_h) / float(self.layer_h)
......@@ -66,8 +68,7 @@ class TestPriorBoxOp(OpTest):
self.min_sizes = [2, 4]
self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
self.max_sizes = [5, 10]
self.max_sizes = np.array(self.max_sizes).astype('float32').tolist()
self.set_max_sizes()
self.aspect_ratios = [2.0, 3.0]
self.flip = True
self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
......@@ -79,7 +80,7 @@ class TestPriorBoxOp(OpTest):
self.clip = True
self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
if len(self.max_sizes) > 1:
if len(self.max_sizes) > 0:
self.num_priors += len(self.max_sizes)
self.offset = 0.5
......@@ -105,11 +106,15 @@ class TestPriorBoxOp(OpTest):
idx = 0
for s in range(len(self.min_sizes)):
min_size = self.min_sizes[s]
c_w = c_h = min_size / 2.
out_boxes[h, w, idx, :] = [
(c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h
]
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h]
idx += 1
if len(self.max_sizes) > 0:
......@@ -122,18 +127,6 @@ class TestPriorBoxOp(OpTest):
(c_y + c_h) / self.image_h]
idx += 1
# rest of priors
for r in range(len(self.real_aspect_ratios)):
ar = self.real_aspect_ratios[r]
if math.fabs(ar - 1.) < 1e-6:
continue
c_w = min_size * math.sqrt(ar) / 2
c_h = (min_size / math.sqrt(ar)) / 2
out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
(c_y - c_h) / self.image_h,
(c_x + c_w) / self.image_w,
(c_y + c_h) / self.image_h]
idx += 1
# clip the prior's coordidate such that it is within[0, 1]
if self.clip:
out_boxes = np.clip(out_boxes, 0.0, 1.0)
......@@ -144,5 +137,10 @@ class TestPriorBoxOp(OpTest):
self.out_var = out_var.astype('float32')
class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
def set_max_sizes(self):
self.max_sizes = []
if __name__ == '__main__':
unittest.main()
......@@ -14,13 +14,14 @@
import unittest
import paddle.fluid.core as core
from paddle.fluid.framework import Program
class TestOpDesc(unittest.TestCase):
def test_op_desc(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
program_desc = core.ProgramDesc()
self.assertIsNotNone(program_desc)
block = program_desc.block(0)
self.assertIsNotNone(block)
op = block.append_op()
self.assertIsNotNone(op)
......@@ -66,7 +67,7 @@ class TestOpDesc(unittest.TestCase):
self.assertEqual(8, len(op.attr_names()))
op.set_block_attr("block_attr", prog.block(0))
op.set_block_attr("block_attr", program_desc.block(0))
self.assertEqual(0, op.block_attr("block_attr"))
mul_op = block.append_op()
......@@ -87,20 +88,20 @@ class TestProgramDesc(unittest.TestCase):
del program_desc
def test_append_block(self):
prog_desc = core.ProgramDesc()
self.assertIsNotNone(prog_desc)
block_root = prog_desc.block(0)
program_desc = core.ProgramDesc()
self.assertIsNotNone(program_desc)
block_root = program_desc.block(0)
self.assertIsNotNone(block_root)
self.assertEqual(block_root.id, 0)
block1 = prog_desc.append_block(block_root)
block2 = prog_desc.append_block(block1)
block1 = program_desc.append_block(block_root)
block2 = program_desc.append_block(block1)
self.assertIsNotNone(block1)
self.assertEqual(block1.id, block2.parent)
self.assertEqual(block_root.id, block1.parent)
block3 = prog_desc.append_block(block_root)
block3 = program_desc.append_block(block_root)
self.assertEqual(block3.parent, block_root.id)
self.assertEqual(prog_desc.block(1).id, 1)
self.assertEqual(4, prog_desc.num_blocks())
self.assertEqual(program_desc.block(1).id, 1)
self.assertEqual(4, program_desc.num_blocks())
class TestVarDesc(unittest.TestCase):
......@@ -161,9 +162,9 @@ class TestVarDesc(unittest.TestCase):
class TestBlockDesc(unittest.TestCase):
def test_add_var(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
program_desc = core.ProgramDesc()
self.assertIsNotNone(program_desc)
block = program_desc.block(0)
self.assertIsNotNone(block)
var1 = block.var("var1")
var2 = block.var("var2")
......@@ -174,9 +175,9 @@ class TestBlockDesc(unittest.TestCase):
self.assertEqual(var2_re, var2)
def test_add_op(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
program_desc = core.ProgramDesc()
self.assertIsNotNone(program_desc)
block = program_desc.block(0)
self.assertIsNotNone(block)
op1 = block.append_op()
op2 = block.append_op()
......@@ -187,32 +188,46 @@ class TestBlockDesc(unittest.TestCase):
self.assertEqual(all_ops, [op0, op1, op2])
def test_remove_op(self):
prog = core.ProgramDesc()
self.assertIsNotNone(prog)
block = prog.block(0)
program = Program()
program_desc = program.desc
self.assertIsNotNone(program_desc)
block = program_desc.block(0)
self.assertIsNotNone(block)
op0 = block.append_op()
op1 = block.append_op()
op2 = block.append_op()
op0.set_type("test")
op1.set_type("test")
op2.set_type("test")
var0 = block.var("var0")
var1 = block.var("var1")
var2 = block.var("var2")
var3 = block.var("var3")
var4 = block.var("var4")
var5 = block.var("var5")
op0.set_input("X", ["var0"])
op0.set_output("Y", ["var0"])
op1.set_input("X", ["var1", "var2"])
op1.set_output("Y", ["var3", "var4"])
op2.set_input("X", ["var1"])
op2.set_output("Y", ["var4", "var5"])
program.sync_with_cpp()
# remove op1, its input var2 and output var3 will be removed at the same time,
# but its input var1 and output var4 will not be removed since they are used for op2.
block.remove_op(0, 1)
block.remove_op(1, 2)
program.sync_with_cpp()
all_ops = []
for idx in xrange(0, block.op_size()):
all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op2])
self.assertEqual(all_ops, [op0, op2])
all_vars = block.all_vars()
self.assertEqual(set(all_vars), {var1, var4, var5})
self.assertEqual(set(all_vars), {var0, var1, var4, var5})
if __name__ == '__main__':
......
......@@ -97,5 +97,72 @@ class TestSparseSGDOp(unittest.TestCase):
self.check_with_place(place)
class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
def check_with_place(self, place):
scope = core.Scope()
row_width = 12
# create and initialize Grad Variable
grad_height = 10
grad_rows = [0, 4, 7]
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(grad_height)
grad_selected_rows.set_rows(grad_rows)
grad_array = np.ones((len(grad_rows), row_width)).astype("float32")
grad_array[0, 0] = 2.0
grad_array[2, 8] = 4.0
grad_tensor = grad_selected_rows.get_tensor()
grad_tensor.set(grad_array, place)
# create and initialize Param Variable
# create and initialize W Variable
param_rows = [0, 1, 2, 3, 4, 5, 6, 7]
# init Param
w_selected_rows = scope.var('Param').get_selected_rows()
w_selected_rows.set_height(len(param_rows))
w_selected_rows.set_rows(param_rows)
w_array = np.ones((len(param_rows), row_width)).astype("float32")
for i in range(len(param_rows)):
w_array[i] *= i
w_tensor = w_selected_rows.get_tensor()
w_tensor.set(w_array, place)
w_before_optimize = np.array(w_tensor)
# create and initialize LeraningRate Variable
lr_value = 0.1
lr = scope.var('LearningRate').get_tensor()
lr_array = np.full((1), lr_value).astype("float32")
lr.set(lr_array, place)
# optimize with Python
w_after_optimize = np.copy(w_before_optimize)
for index, id in enumerate(grad_rows):
w_after_optimize[id] = w_before_optimize[
id] - lr_value * grad_array[index]
# create and run sgd operator
sgd_op = Operator(
"sgd",
Param='Param',
Grad='Grad',
ParamOut='Param',
LearningRate='LearningRate')
sgd_op.run(scope, place)
# get and compare result
result_array = np.array(w_tensor)
assert (result_array == w_after_optimize).all()
def test_sparse_parameter_sgd(self):
places = [core.CPUPlace()]
# do not support GPU kernel currently
for place in places:
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
......@@ -68,6 +68,17 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp):
self.use_cudnn = True
class TestSoftmaxFP16Op(TestSoftmaxOp):
def init_kernel_type(self):
self.dtype = np.float16
def test_check_output(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place, atol=1e-3)
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_cudnn = True
......
......@@ -471,6 +471,7 @@ class Input(Cfg):
maxout=None,
spp=None,
pad=None,
upsample=None,
format=None,
nnz=None,
is_static=None,
......@@ -983,6 +984,13 @@ class Pad(Cfg):
self.add_keys(locals())
@config_class
class Upsample(Cfg):
def __init__(self, scale, scale_y, pad_out_x, pad_out_y, upsample_size,
upsample_size_y):
self.add_keys(locals())
@config_class
class Norm(Cfg):
def __init__(self,
......@@ -2380,6 +2388,46 @@ class SpatialPyramidPoolLayer(LayerBase):
self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
@config_layer('upsample')
class UpsampleLayer(LayerBase):
def __init__(self, name, inputs, **xargs):
super(UpsampleLayer, self).__init__(
name, 'upsample', 0, inputs=inputs, **xargs)
input_layer = self.get_input_layer(0)
image_conf = self.config.inputs[0].upsample_conf.image_conf
image_conf.img_size = input_layer.width
image_conf.img_size_y = input_layer.height
image_conf.channels = input_layer.size / (input_layer.width *
input_layer.height)
upsample = self.inputs[0].upsample
output_x = 0
output_y = 0
output_size = 0
if upsample.scale:
self.config.inputs[0].upsample_conf.scale = upsample.scale
self.config.inputs[0].upsample_conf.scale_y = upsample.scale_y
output_x = input_layer.width * upsample.scale
output_y = input_layer.height * upsample.scale_y
self.config.inputs[0].upsample_conf.pad_out_x = upsample.pad_out_x
self.config.inputs[0].upsample_conf.pad_out_y = upsample.pad_out_y
if upsample.upsample_size:
self.config.inputs[
0].upsample_conf.upsample_size = upsample.upsample_size
self.config.inputs[
0].upsample_conf.upsample_size_y = upsample.upsample_size_y
output_x = upsample.upsample_size
output_y = upsample.upsample_size_y
output_size = image_conf.channels * output_x * output_y
self.set_layer_height_width(output_y, output_x)
self.set_layer_depth(input_layer.depth)
self.set_layer_size(output_size)
@config_layer('pad')
class PadLayer(LayerBase):
def __init__(self, name, inputs, **xargs):
......
......@@ -148,6 +148,7 @@ __all__ = [
'resize_layer',
'sub_seq_layer',
'scale_sub_region_layer',
'upsample_layer',
'factorization_machine',
]
......@@ -166,6 +167,7 @@ class LayerType(object):
SEQUENCE_RESHAPE = 'seqreshape'
POOLING_MAX = 'max'
POOLING_AVG = 'average'
UPSAMPLE_LAYER = 'upsample'
FC_LAYER = 'fc'
COST = 'cost'
COSINE_SIM_VEC = 'cos_vm'
......@@ -3014,6 +3016,83 @@ def img_pool3d_layer(input,
size=l.config.size)
@wrap_name_default("upsample")
@layer_support()
def upsample_layer(input,
name=None,
scale=None,
scale_y=None,
upsample_size=None,
upsample_size_y=None,
pad_out_x=False,
pad_out_y=False,
layer_attr=None):
"""
The DePooling process.
Inputs should be a list of length 2. The first input is a layer,
and the second input should be the MaxWithMaskPoolingLayer
The example usage is:
.. code-block:: python
pool1 = paddle.v2.layer.img_pool(input=input, pool_size=2, stride=2,
pool_type=paddle.pooling.MaxWithMask())
upsample = paddle.v2.layer.upsample(input=[layer1, pool1])
:param name: The name of this layer. It is optional.
:type name: basestring
:param input: contains an input layer and a MaxWithMaskPoolingLayer
:type input: list | tuple | collections.Sequence
:param scale: outputSize = scale * inputSize
:type scale: int | list | tuple | .
:param scale_y: scale_y will be equal to scale, if it's value is None,
:type scale: int | None.
:param upsample_size: specify the outputSize.
:type upsample_size: int | list | tuple.
:param upsample_size_y: specify the y dimension outputSize.
:type upsample_size_y: int.
:param pad_out_x: specify exact x dimension size. This parameter only works when scale is 2
:type pad_out_x: bool.
:param pad_out_y: specify exact y dimension size. This parameter only works when scale is 2
:type pad_out_y: bool.
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert (scale is not None) or (upsample_size is not None), \
'scale or upsample_size, there must be one to be designated'
assert len(input) == 2, 'layer input size must be 2'
assert input[1].layer_type == LayerType.POOL_LAYER, \
'the second input should be the MaxPoolWithMaskLayer'
scale_y = scale \
if scale is not None else scale_y
upsample_size_y = upsample_size \
if upsample_size is not None else upsample_size_y
layer_type = LayerType.UPSAMPLE_LAYER
layer = Layer(
name=name,
type=layer_type,
inputs=[
Input(
input[0].name,
upsample=Upsample(scale, scale_y, pad_out_x, pad_out_y,
upsample_size, upsample_size_y)),
Input(input[1].name)
],
**ExtraLayerAttribute.to_kwargs(layer_attr))
sz = layer.config.size
return LayerOutput(name, layer_type=layer_type, parents=input, size=sz)
@wrap_name_default("spp")
@layer_support()
def spp_layer(input,
......
#################### test_config_parser #########################
add_test(NAME layers_test
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
add_test(NAME test_reset_hook
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
add_test(NAME test_layerHelpers
COMMAND
${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
)
......@@ -2,7 +2,6 @@
set -e
cd `dirname $0`
export PYTHONPATH=$PWD/../../../../
protostr=$PWD/protostr
. file_list.sh
......
......@@ -58,7 +58,7 @@ def mkl():
'istaged': ISTAGED,
'with_mkl': '@WITH_MKL@'})
write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
packages=['paddle',
......@@ -107,9 +107,10 @@ package_dir={
# So that package points to other directory.
'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
}
if '${WITH_FLUID_ONLY}'== 'OFF':
package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle'
package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
paddle_rt_lib_dir = 'lib'
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册