提交 de13f0eb 编写于 作者: L Luo Tao

Merge branch 'develop' into infer_mkl

......@@ -56,7 +56,7 @@ script:
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
......
......@@ -244,11 +244,11 @@ function(cc_test TARGET_NAME)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
......@@ -311,8 +311,8 @@ function(nv_test TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
add_test(${TARGET_NAME} ${TARGET_NAME})
endif()
endfunction(nv_test)
......
......@@ -20,9 +20,8 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty());
}
// Read the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, the '*out' will be an empty std::vector.
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Show whether the next bacth exists.
virtual bool HasNext() const = 0;
// Reinitialize the reader and read the file from the begin.
virtual void ReInit() = 0;
......
......@@ -59,6 +59,17 @@ After converting:
queue. It will block until the queue has the required number of
tensors.
### Sparse Update
For embedding layers, the gradient may have many rows containing only 0 when training,
if the gradient uses a dense tensor to do parameter optimization,
it could spend unnecessary memory, slow down the calculations and waste
the bandwidth while doing distributed training.
In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
non-zero gradient data. So when we do parameter optimization both locally and remotely,
we only need to send those non-zero rows to the optimizer operators:
<img src="src/sparse_update.png" width="700" />
### Benefits
......@@ -91,6 +102,6 @@ After converting:
`min_count` attribute), does our current design support it? (similar
question for the *Add* OP)
### References
### References:
[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
......@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
以EigenTensor为例,做一个介绍
......@@ -125,7 +125,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector<T>::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector<T>::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。
更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)
更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)
......
......@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
Using EigenTensor as an example:
......@@ -125,7 +125,7 @@ EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
......
......@@ -2,17 +2,17 @@
Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
# Compile Time
The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
......@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost)
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
- Every Layer has one or more operators and variables/parameters
- All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
- Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
- Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
- Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
- All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files:
- Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
- Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
- Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
- Optimizer: `fluid.optimizer.SGD`. It does the following
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
......@@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(),
fetch_list=[avg_cost])
```
- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
- The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
- The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
- Feeds the data: `feed=feeder.feed(data)`
- Evaluates all the operators
- Fetches the result: `fetch_list=[avg_cost]`
- Other worth looking files:
- Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
- Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
- Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
- Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
- Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
- Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
......@@ -34,15 +34,15 @@ PaddlePaddle可以使用常用的Python包管理工具
:align: center
.. csv-table:: 各个版本最新的whl包
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
.. _pip_dependency:
......
......@@ -37,15 +37,15 @@ If the links below shows up the login form, just click "Log in as guest" to star
:align: center
.. csv-table:: whl package of each version
:header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
:header: "version", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
.. _pip_dependency:
......
开发标准
========
PaddlePaddle遵守如下三个部分的代码和文档规范。
PaddlePaddle使用git做版本管理,docker作为构建和测试环境。代码中包含了Cuda, C++, Python, Shell等多种编程语言。语言规范遵守Google C++ Style, Pep-8, 代码库中包含自动化检查工具做风格检查。代码注释需要遵守Doxygen规范,不满足风格要求的代码会编译失败。关于如何使用git, 构建测试及代码开发, 我们提供了如下指南。
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
PaddlePaddle面向国内外用户,包含了中文和英文两部分的文档。设计文档和issue问题描述都推荐使用英文。对于设计文档,重在问题描述,背景阐述,然后才是解决方案。文档由Sphinx生成,因此代码注释也需要符合Sphinx文档标准。推荐本地使用paddlepaddle.org工具编译生成和预览文档,请参阅如下文档。
.. toctree::
:maxdepth: 1
write_docs_cn.rst
PaddlePaddle V2 使用新增Layer方式定义新的操作。组合基础API可以实现多种复杂Layer, 满足绝大多数应用。如需要定制Layer,请参阅如下文档,欢迎提交patch。
.. toctree::
:maxdepth: 1
new_layer_cn.rst
新手入门
============
如果需要快速了解PaddlePaddle的使用,可以参考以下指南。
.. toctree::
:maxdepth: 1
quickstart_cn.rst
在使用PaddlePaddle构建应用时,需要了解一些基本概念。
这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。
.. toctree::
:maxdepth: 1
concepts/use_concepts_cn.rst
## 安装与编译C-API预测库
### 概述
使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项:
必须配置选项:
- `WITH_C_API`,必须配置为`ON`
推荐配置选项:
- `WITH_PYTHON`,推荐配置为`OFF`
- `WITH_SWIG_PY`,推荐配置为`OFF`
- `WITH_GOLANG`,推荐设置为`OFF`
可选配置选项:
- `WITH_GPU`,可配置为`ON/OFF`
- `WITH_MKL`,可配置为`ON/OFF`
对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
## 安装、编译与链接C-API预测库
### 直接下载安装
从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本:
<table>
<thead>
<tr>
<th>版本说明</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>暂无</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>
### 从源码编译
用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项:
<table>
<thead>
<tr>
<th>选项</th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>
建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
......@@ -100,23 +158,19 @@ cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
目前提供三种链接方式:
1. 链接`libpaddle_capi_shared.so` 动态库
- 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意:
1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。
1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。
1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。
- 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**
2. 链接静态库 `libpaddle_capi_whole.a`
- 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意:
1. 需要指定`-Wl,--whole-archive`链接选项。
1. 需要显式地链接 `gflags``glog``libz``protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。
1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`
1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。
3. 链接静态库 `libpaddle_capi_layers.a``libpaddle_capi_engine.a`
- 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意:
1. 这种链接方式主要用于移动端预测。
1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。
1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。
1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。
1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意:
1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。
1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。
1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。
2. 链接静态库 `libpaddle_capi_whole.a`,需注意:
1. 需要指定`-Wl,--whole-archive`链接选项。
1. 需要显式地链接 `gflags``glog``libz``protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。
1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`
1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。
3. 链接静态库 `libpaddle_capi_layers.a``libpaddle_capi_engine.a`,需注意:
1. 这种链接方式主要用于移动端预测。
1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。
1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。
1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。
......@@ -71,6 +71,13 @@ paddle.init(
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
```python
trainer = paddle.trainer.SGD(..., is_local=False)
```
参数说明
- is_local: **必选, 默认True**, 是否使用PServer更新参数
## 准备数据集
......
......@@ -73,6 +73,14 @@ Parameter Description
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
```python
trainer = paddle.trainer.SGD(..., is_local=False)
```
Parameter Description
- is_local: **required, default True**, whether update parameters by PServer.
## Prepare Training Dataset
Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism(trainers count). Modify `SPLIT_COUNT` at the begining of `prepare.py` to change the count of output files.
......
......@@ -22,7 +22,7 @@
pooling
========
pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API
pooling 的使用示例如下。
.. code-block:: bash
......@@ -47,7 +47,7 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API
last_seq 和 first_seq
=====================
last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详细见 :ref:`api_v2.layer_last_seq` 配置API
last_seq 的使用示例如下(first_seq 类似)
.. code-block:: bash
......@@ -68,7 +68,7 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详
expand
======
expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API
expand 的使用示例如下。
.. code-block:: bash
......
......@@ -4,7 +4,7 @@
单双层RNN API对比介绍
#####################
本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/reyoung/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
示例1:双层RNN,子序列间无Memory
================================
......@@ -166,11 +166,6 @@
在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。
示例4:beam_search的生成
========================
TBD
词汇表
======
......
RNN模型
===========
循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。
这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。
第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例,
说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。
.. toctree::
:maxdepth: 1
rnn_config_cn.rst
Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理,
对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。
.. toctree::
:maxdepth: 1
recurrent_group_cn.md
第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。
.. toctree::
:maxdepth: 1
hierarchical_layer_cn.rst
第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。
.. toctree::
:maxdepth: 1
hrnn_rnn_api_compare_cn.rst
......@@ -135,6 +135,14 @@ OpDesc *BlockDesc::PrependOp() {
return ops_.front().get();
}
OpDesc *BlockDesc::InsertOp(size_t index) {
need_update_ = true;
auto it = ops_.begin() + index;
std::unique_ptr<OpDesc> new_op(new OpDesc(this));
it = ops_.insert(it, std::move(new_op));
return (*it).get();
}
void BlockDesc::RemoveOp(size_t s, size_t e) {
if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
return;
......
......@@ -87,6 +87,8 @@ class BlockDesc {
OpDesc *PrependOp();
OpDesc *InsertOp(size_t index);
void RemoveOp(size_t s, size_t e);
std::vector<OpDesc *> AllOps() const;
......
......@@ -34,6 +34,15 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
: prog_(prog), block_id_(block_id) {}
framework::ProgramDesc prog_;
size_t block_id_;
std::vector<std::unique_ptr<OperatorBase>> ops_;
};
Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
......@@ -85,73 +94,9 @@ static void CheckTensorNANOrInf(const std::string& name,
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail):
// - only runs on the first device (i.e. no interdevice communication)
// - will change to use multiple blocks for RNN op and Cond Op
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
auto& block = pdesc.Block(block_id);
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
}
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(2) << "-------------------------------------------------------";
}
auto* ctx = Prepare(pdesc, block_id);
RunPreparedContext(ctx, scope, create_local_scope, create_vars);
delete ctx;
}
// Check whether the block already has feed operators and feed_holder.
......@@ -313,5 +258,81 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
delete copy_program;
}
ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
int block_id) {
auto* ctx = new ExecutorPrepareContext(program, block_id);
PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
auto& block = program.Block(block_id);
for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
}
return ctx;
}
void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope, bool create_vars) {
auto& block = ctx->prog_.Block(ctx->block_id_);
Scope* local_scope = scope;
if (create_vars) {
if (create_local_scope) {
local_scope = &scope->NewScope();
for (auto& var : block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
if (var->Persistable()) {
auto* ptr = scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
} else {
for (auto& var : block.AllVars()) {
auto* ptr = local_scope->Var(var->Name());
CreateTensor(ptr, var->GetType());
VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
<< ptr;
}
} // if (create_local_scope)
} // if (create_vars)
for (auto& op : ctx->ops_) {
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
}
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
VLOG(2) << "-------------------------------------------------------";
}
}
} // namespace framework
} // namespace paddle
......@@ -22,7 +22,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
struct ExecutorPrepareContext;
class Executor {
public:
// TODO(dzhwinter) : Do not rely on this function, it will be removed
......@@ -38,8 +38,8 @@ class Executor {
* ProgramDesc
* Scope
*/
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
void Run(const ProgramDesc& prog, Scope* scope, int block_id,
bool create_local_scope = true, bool create_vars = true);
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
......@@ -47,6 +47,13 @@ class Executor {
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
int block_id);
void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
bool create_local_scope = true,
bool create_vars = true);
private:
const platform::Place place_;
};
......
......@@ -74,6 +74,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
platform::SetDeviceId(dev_id);
#endif
}
// profile
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
platform::RecordEvent record_event(Type(), dev_ctx);
RunImpl(scope, place);
}
......@@ -497,9 +500,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto dev_ctx = pool.Get(place);
// profile
platform::RecordEvent record_event(Type(), dev_ctx);
auto* dev_ctx = pool.Get(place);
// check if op[type] has kernel registered.
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
......
......@@ -26,7 +26,6 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty());
}
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual bool HasNext() const = 0;
virtual void ReInit() = 0;
......@@ -52,8 +51,6 @@ class DecoratedReader : public ReaderBase {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
bool HasNext() const override { return reader_->HasNext(); }
void ReInit() override { reader_->ReInit(); }
protected:
......@@ -68,13 +65,25 @@ class ReaderHolder {
ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
bool HasNext() const { return reader_->HasNext(); }
void ReInit() { reader_->ReInit(); }
void ReadNext(std::vector<LoDTensor>* out) {
PADDLE_ENFORCE_NOT_NULL(reader_);
reader_->ReadNext(out);
}
void ReInit() {
PADDLE_ENFORCE_NOT_NULL(reader_);
reader_->ReInit();
}
DDim shape(size_t idx) const { return reader_->shape(idx); }
std::vector<DDim> shapes() const { return reader_->shapes(); }
DDim shape(size_t idx) const {
PADDLE_ENFORCE_NOT_NULL(reader_);
return reader_->shape(idx);
}
std::vector<DDim> shapes() const {
PADDLE_ENFORCE_NOT_NULL(reader_);
return reader_->shapes();
}
void set_shapes(const std::vector<DDim>& shapes) {
PADDLE_ENFORCE_NOT_NULL(reader_);
reader_->set_shapes(shapes);
}
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> // for unique_ptr
#include <mutex> // for call_once
#include <set>
#include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
......@@ -102,6 +103,18 @@ void Scope::DeleteScope(Scope* scope) {
}
}
void Scope::EraseVars(std::vector<std::string>& var_names) {
std::set<std::string> var_set(var_names.begin(), var_names.end());
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
delete it->second;
it = vars_.erase(it);
} else {
++it;
}
}
}
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
auto origin_it = vars_.find(origin_name);
......
......@@ -51,6 +51,8 @@ class Scope {
/// Create a variable with a scope-unique name.
Variable* Var(std::string* name = nullptr);
void EraseVars(std::vector<std::string>& var_names);
/// Find a variable in the scope or any of its ancestors. Returns
/// nullptr if cannot find.
Variable* FindVar(const std::string& name) const;
......
......@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
......
......@@ -67,10 +67,10 @@ class ThreadPool {
} catch (platform::EnforceNotMet ex) {
return std::unique_ptr<platform::EnforceNotMet>(
new platform::EnforceNotMet(ex));
} catch (...) {
LOG(FATAL)
<< "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet.";
} catch (const std::exception& e) {
LOG(FATAL) << "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet."
<< e.what();
}
return nullptr;
});
......
......@@ -115,11 +115,11 @@ void TestInference(const std::string& dirname,
#endif
}
// Enable the profiler
paddle::platform::EnableProfiler(state);
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
// Enable the profiler
paddle::platform::EnableProfiler(state);
{
paddle::platform::RecordEvent record_event(
"init_program",
......@@ -143,6 +143,10 @@ void TestInference(const std::string& dirname,
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
}
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"load_program_profiler.txt");
paddle::platform::ResetProfiler();
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
......@@ -165,6 +169,12 @@ void TestInference(const std::string& dirname,
// 6. Run the inference program
{
// Ignore the profiling results of the first run
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
// Enable the profiler
paddle::platform::EnableProfiler(state);
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event(
......@@ -173,12 +183,13 @@ void TestInference(const std::string& dirname,
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
}
}
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"profiler.txt");
paddle::platform::ResetProfiler();
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(
paddle::platform::EventSortingKey::kDefault,
"run_inference_profiler.txt");
paddle::platform::ResetProfiler();
}
delete scope;
}
......@@ -222,8 +222,6 @@ cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
......@@ -63,13 +63,27 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
}
};
class CastOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CastOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
return kt;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
ops::CastOpProtoMaker);
REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker,
ops::CastOpInferShape, ops::CastOpProtoMaker);
REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
ops::CastOpKernel<CPU, double>,
ops::CastOpKernel<CPU, int>,
......
......@@ -12,58 +12,21 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;
namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine);
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
} // anonymous namespace
template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
......@@ -88,7 +51,6 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
......@@ -102,48 +64,69 @@ class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
// memory descriptors for convolution src/weight/dst
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory primitives
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);
std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
paddings, mkldnn_engine);
// save p_conv_pd into dev_ctx to be referred in backward path
auto p_conv_pd = conv_pd.get();
std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
dev_ctx.SetBlob(key_conv_pd, conv_pd_value);
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine);
// save conv_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_conv_pd, conv_pd);
// create convolution op primitive
auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
conv_weights_memory, conv_dst_memory);
auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
weights_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
// push op to stream and wait MKLDNN until it's executed
std::vector<primitive> pipeline{conv_prim};
stream(stream::kind::eager).submit(pipeline).wait();
private:
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
};
template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
......@@ -170,7 +153,6 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
// allocate memory for gradient of input/filter
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
......@@ -184,130 +166,111 @@ class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_diff_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
// create memory
auto conv_diff_dst_memory =
memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data);
auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
(void*)output_grad_data);
// Retrieve conv_pd from device context
std::shared_ptr<void> conv_pd;
convolution_forward::primitive_desc* p_conv_pd;
conv_pd = dev_ctx.GetBlob(key_conv_pd);
auto conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
dev_ctx.GetBlob(key_conv_pd));
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
p_conv_pd =
static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
strides, paddings, *conv_pd,
mkldnn_engine);
// create memory
auto conv_diff_weights_memory = memory(
{conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto diff_weights_memory = mkldnn::memory(
{diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = convolution_backward_weights(
conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
conv_diff_weights_memory);
auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
conv_bwd_weights_pd, src_memory, diff_dst_memory,
diff_weights_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_weights_prim};
stream(stream::kind::eager).submit(pipeline).wait();
std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
strides, paddings, *conv_pd, mkldnn_engine);
// create memory
auto conv_diff_src_memory =
memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim =
convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
conv_weights_memory, conv_diff_src_memory);
auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_data_prim};
stream(stream::kind::eager).submit(pipeline).wait();
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
private:
mkldnn::convolution_backward_weights::primitive_desc
ConvBwdWeightsPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_weights::primitive_desc(
conv_bwd_weights_desc, engine, conv_pd);
}
mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
engine, conv_pd);
}
};
namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_weights_desc = convolution_backward_weights::desc(
convolution_direct, src, diff_weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
engine, conv_pd);
}
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_data_desc = convolution_backward_data::desc(
convolution_direct, diff_src, weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
conv_pd);
}
} // anonymous namespace
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvOpMkldnnKernel<float>);
ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvGradOpMkldnnKernel<float>);
ops::ConvMKLDNNGradOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
class DeleteVarOp : public framework::OperatorBase {
public:
DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
dev_ctx.Wait();
auto delete_var_names = Inputs("X");
const_cast<framework::Scope &>(scope).EraseVars(delete_var_names);
}
};
class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
public:
DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input of delete op").AsDuplicable();
AddComment(R"DOC(
Delete Operator.
It should not be configured by users directly.
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp,
paddle::framework::EmptyGradOpMaker,
paddle::operators::DeleteVarOpInfoMaker);
if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
GrpcByteBufferSource::GrpcByteBufferSource() {}
bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
cur_ = -1;
left_ = 0;
ptr_ = nullptr;
byte_count_ = 0;
bool ok = src.Dump(&slices_).ok();
if (!ok) {
slices_.clear();
}
return ok;
}
bool GrpcByteBufferSource::Next(const void** data, int* size) {
// Use loop instead of if in case buffer contained empty slices.
while (left_ == 0) {
// Advance to next slice.
cur_++;
if (cur_ >= slices_.size()) {
return false;
}
const ::grpc::Slice& s = slices_[cur_];
left_ = s.size();
ptr_ = reinterpret_cast<const char*>(s.begin());
}
*data = ptr_;
*size = left_;
byte_count_ += left_;
ptr_ += left_;
left_ = 0;
return true;
}
void GrpcByteBufferSource::BackUp(int count) {
ptr_ -= count;
left_ += count;
byte_count_ -= count;
}
bool GrpcByteBufferSource::Skip(int count) {
const void* data;
int size;
while (Next(&data, &size)) {
if (size >= count) {
BackUp(size - count);
return true;
}
// size < count;
count -= size;
}
// error or we have too large count;
return false;
}
google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_;
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
namespace paddle {
namespace operators {
namespace detail {
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource
: public ::google::protobuf::io::ZeroCopyInputStream {
public:
GrpcByteBufferSource();
bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times.
bool Next(const void** data, int* size) override;
void BackUp(int count) override;
bool Skip(int count) override;
::google::protobuf::int64 ByteCount() const override;
private:
std::vector<::grpc::Slice> slices_;
size_t cur_; // Current slice index.
int left_; // Number of bytes in slices_[cur_] left to yield.
const char* ptr_; // Address of next byte in slices_[cur_] to yield.
::google::protobuf::int64 byte_count_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace detail {
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B - 1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
class ProtoEncodeHelper {
public:
ProtoEncodeHelper(char* buf, int max_size)
: base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() {
// Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE(p_, limit_);
}
const char* data() const { return base_; }
size_t size() const { return p_ - base_; }
void WriteUint64(int tag, uint64_t v) {
Encode32(combine(tag, WIRETYPE_VARINT));
Encode64(v);
}
void WriteBool(int tag, bool v) {
Encode32(combine(tag, WIRETYPE_VARINT));
EncodeBool(v);
}
void WriteString(int tag, const std::string& v) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(v.size());
EncodeBytes(v.data(), v.size());
}
void WriteVarlengthBeginning(int tag, uint32_t len) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(len);
}
void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
private:
// Note: this module's behavior must match the protocol buffer wire encoding
// format.
enum {
WIRETYPE_VARINT = 0,
WIRETYPE_LENGTH_DELIMITED = 2,
};
static uint32_t combine(uint32_t tag, uint32_t type) {
return ((tag << 3) | type);
}
inline void Encode32(uint32_t v) {
if (v < 128) {
// Fast path for single-byte values. Many of the calls will use a
// constant value for v, so the comparison will get optimized away
// when Encode32 is inlined into the caller.
*p_ = v;
p_++;
} else {
p_ = EncodeVarint32(p_, v);
}
}
void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
void EncodeBool(bool v) {
*p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1
p_++;
}
void EncodeBytes(const char* bytes, int N) {
memcpy(p_, bytes, N);
p_ += N;
}
char* base_;
char* p_;
char* limit_; // Just for CHECKs
};
} // detail
} // operators
} // paddle
......@@ -33,10 +33,34 @@ enum VarType {
}
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2;
bytes serialized = 3;
// bool persistable is not needed for sending.
// tensor info:
Type data_type = 3;
repeated int64 dims = 4;
// lod details:
int64 lod_level = 5;
repeated LodData lod = 6;
// tensor data
bytes serialized = 7;
// selected_rows data
bytes rows = 8;
}
message VoidMessage {}
......@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
namespace paddle {
namespace operators {
......@@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
}
}
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) {
using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent
// parallelism execution, need to know when to free the tensor.
DestroyCallback destroy_callback = [](void* backing) {};
void* buf = malloc(1024);
void* payload = nullptr;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
} else if (var->IsType<framework::SelectedRows>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
}
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(tensor.type()));
for (auto& dim : framework::vectorize(tensor.dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
auto lod = tensor.lod(); // std::vector<Vector<size_t>>
if (lod.size() > 0) {
e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
for (auto& each : lod) {
e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
2 + // tag + varintlength of submessage
1 + // kLodDataFieldNumber
each.size());
// auto copied from GPU
for (auto& d : each) {
e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
}
}
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
// TODO(typhoonzero): selectedrows implement should not use unique_ptr
auto* slr = var->GetMutable<framework::SelectedRows>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(slr->value().type()));
for (auto& dim : framework::vectorize(slr->value().dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
// steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer
slices[0] = ::grpc::Slice(e.size());
memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
slices[1] = ::grpc::Slice(
grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
static_cast<char*>(payload)),
::grpc::Slice::STEAL_REF);
if (framework::ToVarType(var->Type()) ==
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
slices[3] = ::grpc::Slice(
grpc_slice_new_with_user_data(
const_cast<void*>(
reinterpret_cast<const void*>(slr->rows().data())),
rows_memory_size,
[](void* backing) {
// TODO(typhoonzero): add unref here, same as above.
},
const_cast<char*>(
reinterpret_cast<const char*>(slr->rows().data()))),
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
sendrecv::VariableMessage meta;
GrpcByteBufferSource source;
source.Init(msg);
::google::protobuf::io::CodedInputStream input(&source);
// do zerocopy parsing
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
PADDLE_ENFORCE(input.ConsumedEntireMessage());
// dims is needed by both tensor and selectedrows
std::vector<int> vecdims;
for (auto& d : meta.dims()) {
vecdims.push_back(d);
}
framework::DDim dims = framework::make_ddim(vecdims);
if (meta.type() == sendrecv::LOD_TENSOR) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
framework::LoD lod;
for (int i = 0; i < meta.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
v.push_back(meta.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
} else if (meta.type() == sendrecv::SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
int64_t* rows_data = slr->mutable_rows()->data();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
// copy rows CPU data, GPU data will be copied lazly
memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
meta.rows().size());
}
}
} // namespace detail
} // namespace operators
} // namespace paddle
......@@ -33,6 +33,8 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
typedef void (*DestroyCallback)(void*);
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg);
......@@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var,
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
case sendrecv::VariableMessage::FP32:
return typeid(float); // NOLINT
case sendrecv::VariableMessage::FP64:
return typeid(double); // NOLINT
case sendrecv::VariableMessage::INT32:
return typeid(int); // NOLINT
case sendrecv::VariableMessage::INT64:
return typeid(int64_t); // NOLINT
case sendrecv::VariableMessage::BOOL:
return typeid(bool); // NOLINT
default:
PADDLE_THROW("Not support type %d", type);
}
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
void RunSerdeTestTensor(platform::Place place) {
// serialize var to ByteBuffer
framework::Variable var;
auto* tensor = var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
framework::LoD lod;
lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 0);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
}
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto tensor2 = var2.Get<framework::LoDTensor>();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2.data<float>());
}
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
}
void RunSerdeTestSelectedRows(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer
framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10}));
tensor->mutable_data<float>(place);
int tensor_numel = 2 * 10;
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
}
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10);
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto* slr2 = var2.GetMutable<framework::SelectedRows>();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2->data<float>());
}
const int64_t* rows_data2 = rows2->data();
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
}
EXPECT_EQ(rows_data2[0], 3);
EXPECT_EQ(rows_data2[1], 10);
}
TEST(SelectedRows, CPU) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) {
platform::CUDAPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(Tensor, CPU) {
platform::CPUPlace place;
RunSerdeTestTensor(place);
}
TEST(Tensor, GPU) {
platform::CUDAPlace place;
RunSerdeTestTensor(place);
}
\ No newline at end of file
......@@ -273,7 +273,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>& true_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos,
const int class_num) const {
constexpr T kEPS = static_cast<T>(1e-6);
const int* pos_count_data = input_pos_count.data<int>();
for (int i = 0; i < class_num; ++i) {
label_pos_count[i] = pos_count_data[i];
......@@ -282,12 +281,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod();
for (size_t i = 0; i < pos_data_lod.size(); ++i) {
for (size_t j = pos_data_lod[0][i]; j < pos_data_lod[0][i + 1]; ++j) {
auto pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2];
int flag = 1;
if (pos_data[j * 2 + 1] < kEPS) flag = 0;
int flag = pos_data[j * 2 + 1];
pos[i].push_back(std::make_pair(score, flag));
}
}
......
......@@ -29,8 +29,11 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
elementwise_add_grad, ops::ElementwiseOpGrad);
REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp,
ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL(
elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
......
......@@ -41,6 +41,16 @@ class ElementwiseOp : public framework::OperatorWithKernel {
}
};
class ElementwiseOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto x_var = op_desc.Input("X")[0];
auto out_var = op_desc.Output("Out")[0];
block->Var(out_var)->SetType(block->Var(x_var)->GetType());
}
};
class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
public:
ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
......
......@@ -15,11 +15,23 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
......@@ -46,6 +58,15 @@ void gemm<platform::CPUDeviceContext, double>(
beta, C, ldc);
}
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const bool transA,
......@@ -68,6 +89,15 @@ void gemm<platform::CPUDeviceContext, double>(
lda, B, ldb, beta, C, ldc);
}
template <>
void matmul<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
PADDLE_THROW("float16 matmul not supported on CPU");
}
template <>
void matmul<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context,
......@@ -126,6 +156,15 @@ void matmul<platform::CPUDeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
PADDLE_THROW("float16 batched_gemm not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <>
......
......@@ -16,11 +16,43 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
}
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
......@@ -60,6 +92,31 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, N));
}
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, ldc));
}
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const bool transA,
......@@ -90,6 +147,35 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, ldc));
}
template <>
void matmul<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, float16>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
matrix_b.data<float16>(), beta, matrix_out->data<float16>());
}
template <>
void matmul<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context,
......@@ -148,6 +234,37 @@ void matmul<platform::CUDADeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
// TODO(kexinzhao): add processing code for compute capability < 53 case
PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
"cublas Hgemm requires GPU compute capability >= 53");
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
}
template <>
void batched_gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
......
......@@ -14,30 +14,41 @@
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h"
TEST(math_function, notrans_mul_trans) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size());
for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]);
}
}
TEST(math_function, notrans_mul_trans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -45,33 +56,76 @@ TEST(math_function, notrans_mul_trans) {
EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14);
EXPECT_EQ(out_ptr[3], 50);
delete gpu_place;
}
TEST(math_function, trans_mul_notrans) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
TEST(math_function, notrans_mul_trans_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float16>(
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
}
TEST(math_function, trans_mul_notrans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -84,45 +138,93 @@ TEST(math_function, trans_mul_notrans) {
EXPECT_EQ(out_ptr[6], 15);
EXPECT_EQ(out_ptr[7], 22);
EXPECT_EQ(out_ptr[8], 29);
delete gpu_place;
}
TEST(math_function, gemm_notrans_cublas) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
TEST(math_function, trans_mul_notrans_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
}
TEST(math_function, gemm_notrans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......@@ -139,47 +241,110 @@ TEST(math_function, gemm_notrans_cublas) {
EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place;
}
TEST(math_function, gemm_trans_cublas) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
TEST(math_function, gemm_notrans_cublas_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
// b = np.arange(12).reshape(3, 4)[:, 1:]
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}
TEST(math_function, gemm_trans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
context.Wait();
TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24);
EXPECT_EQ(input3_ptr[2], 28);
......@@ -188,27 +353,86 @@ TEST(math_function, gemm_trans_cublas) {
EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place;
}
TEST(math_function, gemm_trans_cublas_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
// fp16 GEMM in cublas requires GPU compute capability >= 53
if (context.GetComputeCapability() < 53) {
return;
}
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}
template <typename T>
void GemvTest(int m, int n, bool trans) {
paddle::framework::Tensor mat_a;
paddle::framework::Tensor vec_b;
paddle::framework::Tensor vec_c;
auto* cpu_place = new paddle::platform::CPUPlace();
T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::framework::Tensor g_mat_a;
paddle::framework::Tensor g_vec_b;
paddle::framework::Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), *gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), *gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), *gpu_place);
using namespace paddle::framework;
using namespace paddle::platform;
Tensor mat_a;
Tensor vec_b;
Tensor vec_c;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
Tensor g_mat_a;
Tensor g_vec_b;
Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
for (int i = 0; i < mat_a.numel(); ++i) {
data_a[i] = static_cast<T>(i);
......@@ -217,16 +441,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i);
}
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a);
paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);
TensorCopy(mat_a, gpu_place, context, &g_mat_a);
TensorCopy(vec_b, gpu_place, context, &g_vec_b);
paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c);
paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context,
&vec_c);
TensorCopy(g_vec_c, cpu_place, context, &vec_c);
if (!trans) {
for (int i = 0; i < m; ++i) {
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
namespace paddle {
namespace operators {
......
......@@ -14,19 +14,15 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -41,26 +37,35 @@ USE_CUDA_ONLY_OP(ncclBcast);
namespace f = paddle::framework;
namespace p = paddle::platform;
static std::vector<int> gpu_list;
// test data amount
const f::DDim kDims = {100, 100};
const f::DDim kDims = {20, 20};
// nccl op common tester, init communicator.
class NCCLTester : public ::testing::Test {
public:
virtual void SetUp() override {
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< count;
exit(0);
}
for (int i = 0; i < count; ++i) {
gpu_list_.emplace_back(i);
}
paddle::platform::CPUPlace cpu_place;
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
p::CUDAPlace place(i);
dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
dev_ctxs_.emplace_back(new p::CUDADeviceContext(place));
}
NCCLInitOp();
}
virtual void TearDown() override {
for (auto &device_context : dev_ctxs) {
for (auto &device_context : dev_ctxs_) {
delete device_context;
}
}
......@@ -70,36 +75,40 @@ class NCCLTester : public ::testing::Test {
std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
op1->SetType("ncclInit");
op1->SetInput("parallel_scopes", {"p_scopes"});
op1->SetOutput("Communicator", {"comm"});
op1->SetAttr("gpus", {gpu_list});
auto *var = g_scope.Var("comm");
auto *var = g_scope_.Var("comm");
var->GetMutable<p::Communicator>();
auto *scope_var = g_scope_.Var("p_scopes");
auto *p_scopes = scope_var->GetMutable<std::vector<f::Scope *>>();
(*p_scopes).resize(gpu_list_.size());
auto op = f::OpRegistry::CreateOp(*op1);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
op->Run(g_scope_, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
int GetGPUData(int gpu_id) { return gpu_id + 42; }
template <class T>
void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
std::unique_lock<std::mutex> lk(mu);
std::unique_lock<std::mutex> lk(mu_);
const f::OpDesc *op1 = &op_desc;
p::CUDAPlace place(gpu_id);
auto &ctx = dev_ctxs.at(gpu_id);
auto &ctx = dev_ctxs_.at(gpu_id);
auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
if (!send_tensor->numel()) {
send_tensor->Resize(kDims);
send_tensor->mutable_data<T>(kDims, place);
std::vector<T> send_vector(f::product(kDims), gpu_id);
std::vector<T> send_vector(f::product(kDims), GetGPUData(gpu_id));
paddle::framework::TensorFromVector<T>(send_vector, *ctx, send_tensor);
ctx->Wait();
VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
}
......@@ -118,30 +127,14 @@ class NCCLTester : public ::testing::Test {
}
public:
std::vector<p::DeviceContext *> dev_ctxs;
f::Scope g_scope;
std::mutex mu;
std::vector<p::DeviceContext *> dev_ctxs_;
f::Scope g_scope_;
std::mutex mu_;
std::vector<int> gpu_list_;
};
// ncclInitOp with desc
TEST(NCCL, ncclInitOp) {
std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
op_desc->SetType("ncclInit");
op_desc->SetOutput("Communicator", {"x1"});
op_desc->SetAttr("gpus", {gpu_list});
f::Scope g_scope;
paddle::platform::CPUPlace cpu_place;
auto *var = g_scope.Var("x1");
var->GetMutable<p::Communicator>();
auto op = f::OpRegistry::CreateOp(*op_desc);
VLOG(1) << "invoke NCCLInitOp.";
op->Run(g_scope, cpu_place);
VLOG(1) << "NCCLInitOp finished.";
}
TEST_F(NCCLTester, ncclInitOp) {}
// ncclAllReduceOp with desc
TEST_F(NCCLTester, ncclAllReduceOp) {
......@@ -155,23 +148,25 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}
// check results
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}
for (size_t i = 0; i < dev_scopes.size(); ++i) {
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[i]);
p::CUDAPlace gpu_place(gpu_list_[i]);
auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
......@@ -180,12 +175,12 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[i])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
}
......@@ -204,22 +199,24 @@ TEST_F(NCCLTester, ncclReduceOp) {
std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}
// check results on
float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
float expected_result = 0.0;
for (int gpu_id : gpu_list_) {
expected_result = expected_result + GetGPUData(gpu_id);
}
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[kRoot]);
p::CUDAPlace gpu_place(gpu_list_[kRoot]);
auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
......@@ -229,12 +226,12 @@ TEST_F(NCCLTester, ncclReduceOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
ASSERT_NEAR(ct[j], expected_result, 1e-5);
}
}
......@@ -252,23 +249,22 @@ TEST_F(NCCLTester, ncclBcastOp) {
std::vector<std::thread> ths;
for (size_t i = 0; i < gpu_list.size(); ++i) {
dev_scopes.emplace_back(&g_scope.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
for (size_t i = 0; i < gpu_list_.size(); ++i) {
dev_scopes.emplace_back(&g_scope_.NewScope());
std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list_[i],
*op2.get(), dev_scopes[i]);
ths.emplace_back(std::move(th));
}
for (size_t i = 0; i < gpu_list.size(); ++i) {
for (size_t i = 0; i < gpu_list_.size(); ++i) {
ths[i].join();
}
const int idx = 1;
// check results on
float result = kRoot;
float result = GetGPUData(kRoot);
p::CPUPlace cpu_place;
p::CUDAPlace gpu_place(gpu_list[idx]);
p::CUDAPlace gpu_place(gpu_list_[idx]);
auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
auto *rt = recv_tensor.data<float>();
......@@ -277,42 +273,11 @@ TEST_F(NCCLTester, ncclBcastOp) {
auto *ct = result_tensor->mutable_data<float>(cpu_place);
paddle::memory::Copy(
cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt,
recv_tensor.numel() * sizeof(float),
static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
static_cast<p::CUDADeviceContext *>(dev_ctxs_[idx])->stream());
for (int64_t j = 0; j < f::product(kDims); ++j) {
ASSERT_NEAR(ct[j], result, 1e-5);
}
}
int main(int argc, char **argv) {
// FIXME(tonyyang-svail):
// Due to the driver issue on our CI, disable for now
return 0;
const int dev_count = p::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
gpu_list.emplace_back(i);
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
// device context should be release before scope.
// otherwise driver will down.
return RUN_ALL_TESTS();
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
// Get an unique name from "argument" name of "Out" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Output("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
// Only 2D pooling is supported now
PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 2D pooling");
PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
"pooling_type must be 'max' or 'avg'");
PADDLE_ENFORCE(input->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine);
// save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd);
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{pool_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding,
const std::vector<int>& kernel, const std::string& pooling_type,
const mkldnn::engine& engine) const {
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero);
auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
return std::unique_ptr<mkldnn::pooling_forward::primitive_desc>(p_pool_pd);
}
std::unique_ptr<mkldnn::memory> CreateWorkspaceMemory(
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd,
const std::string& pooling_type, const mkldnn::engine& engine) const {
mkldnn::memory::primitive_desc workspace_md =
pooling_type == "max"
? pool_pd->workspace_primitive_desc()
: mkldnn::memory::primitive_desc(
{{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
engine);
auto p_workspace_memory = new mkldnn::memory(workspace_md);
return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
}
};
template <typename T>
class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const Tensor* in_x = ctx.Input<Tensor>("X");
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
// Get an unique name from "argument" name of "Out" variable
// This name will be used as key when referring info from device context
const std::string key = ctx.op().Input("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine();
const T* out_grad_data = out_grad->data<T>();
T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> diff_src_tz =
paddle::framework::vectorize2int(in_x_grad->dims());
std::vector<int> diff_dst_tz =
paddle::framework::vectorize2int(out_grad->dims());
auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr,
"Fail to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr,
"Fail to find workspace_memory in device context");
auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
mkldnn::padding_kind::zero);
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{bwd_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
} // Compute()
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNGradOpKernel<float>);
......@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -76,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -107,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -181,6 +183,9 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -276,6 +281,9 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......
......@@ -111,7 +111,8 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
});
AddAttr<std::vector<float>>(
"max_sizes",
"(vector<float>) List of max sizes of generated prior boxes.");
"(vector<float>) List of max sizes of generated prior boxes.")
.SetDefault(std::vector<float>{});
AddAttr<std::vector<float>>(
"aspect_ratios",
"(vector<float>) List of aspect ratios of generated prior boxes.");
......
......@@ -97,9 +97,6 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
boxes->mutable_data<T>(ctx.GetPlace());
vars->mutable_data<T>(ctx.GetPlace());
T inv_img_width = 1.0 / img_width;
T inv_img_height = 1.0 / img_height;
auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
for (int h = 0; h < feature_height; ++h) {
for (int w = 0; w < feature_width; ++w) {
......@@ -110,36 +107,30 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
for (size_t s = 0; s < min_sizes.size(); ++s) {
auto min_size = min_sizes[s];
// first prior: aspect_ratio = 1, size = min_size
box_width = box_height = min_size;
box_width = box_height = min_size / 2.;
// xmin
e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
if (max_sizes.size() > 0) {
auto max_size = max_sizes[s];
// second prior: aspect_ratio = 1,
// size = sqrt(min_size * max_size)
box_width = box_height = sqrt(min_size * max_size);
box_width = box_height = sqrt(min_size * max_size) / 2.;
// xmin
e_boxes(h, w, idx, 0) =
(center_x - box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) =
(center_x + box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
......@@ -149,20 +140,16 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
if (fabs(ar - 1.) < 1e-6) {
continue;
}
box_width = min_size * sqrt(ar);
box_height = min_size / sqrt(ar);
box_width = min_size * sqrt(ar) / 2.;
box_height = min_size / sqrt(ar) / 2.;
// xmin
e_boxes(h, w, idx, 0) =
(center_x - box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
// ymin
e_boxes(h, w, idx, 1) =
(center_y - box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
// xmax
e_boxes(h, w, idx, 2) =
(center_x + box_width * 0.5) * inv_img_width;
e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
// ymax
e_boxes(h, w, idx, 3) =
(center_y + box_height * 0.5) * inv_img_height;
e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
idx++;
}
}
......
......@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {
framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
if (!reader->HasNext()) {
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE(
reader->HasNext(),
!ins.empty(),
"Reader can not read the next data even it has been re-initialized.");
}
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) {
auto* out =
......
......@@ -2,4 +2,5 @@ cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_regist
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op PARENT_SCOPE)
op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE)
......@@ -68,10 +68,10 @@ void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
static constexpr size_t kDoubleBufferSize = 2;
class DoubleBufferReader : public framework::DecoratedReader {
public:
explicit DoubleBufferReader(ReaderBase* reader)
: DecoratedReader(reader),
buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize)) {
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override;
~DoubleBufferReader() { buffer_->Close(); }
private:
void PrefetchThreadFunc();
framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
};
class CreateDoubleBufferReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new DoubleBufferReader(underlying_reader.Get()));
}
};
class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddComment(R"DOC(
CreateDoubleBufferReader Operator
A double buffer reader takes another reader as its 'underlying reader'.
It launches another thread to execute the 'underlying reader' asynchronously,
which prevents reading process from blocking subsequent training.
)DOC");
}
};
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
buffer_->Receive(out);
}
void DoubleBufferReader::ReInit() {
reader_->ReInit();
buffer_->Close();
// The existing prefetch thread will terminate for the buffer_ is closed.
buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize);
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts.";
while (true) {
std::vector<framework::LoDTensor> batch;
reader_->ReadNext(&batch);
if (batch.empty()) {
// EOF
buffer_->Close();
VLOG(5) << "Reached the end of the file. The prefetch thread terminates.";
break;
}
if (!buffer_->Send(&batch)) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread terminates.";
break;
}
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
ops::CreateDoubleBufferReaderOp,
ops::CreateDoubleBufferReaderOpMaker);
......@@ -50,8 +50,6 @@ class RandomDataGenerator : public framework::FileReader {
}
}
bool HasNext() const override { return true; }
void ReInit() override { return; }
private:
......
......@@ -39,10 +39,10 @@ void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
......
......@@ -49,6 +49,10 @@ FileReaderMakerBase::FileReaderMakerBase(
}
void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(
!ctx->IsRuntime(),
"'FileReaderInferShape' should only be invoked during compile time.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
......@@ -56,16 +60,14 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
if (ctx->IsRuntime()) {
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
......@@ -77,19 +79,21 @@ void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
void DecoratedReaderInferShape::operator()(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(!ctx->IsRuntime(),
"'DecoratedReaderInferShape' should only be invoked during "
"compile time.");
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
if (ctx->IsRuntime()) {
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
......
......@@ -24,15 +24,15 @@ limitations under the License. */
namespace paddle {
namespace operators {
static bool IsVariableInitialized(const framework::Scope& scope,
const std::string& varname) {
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().value().IsInitialized();
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
......@@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase {
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (IsVariableInitialized(scope, ins[i])) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
......
......@@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel {
// and run time.
ctx->SetOutputDim("ParamOut", param_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
ctx.GetPlace());
}
};
class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(param, param_out);
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad->rows().size() == 0) {
return;
}
auto in_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
......@@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
auto* lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height,
"Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) {
out_data[in_rows[i] * in_row_numel + j] -=
lr[0] * in_data[i * in_row_numel + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
}
......
......@@ -21,15 +21,24 @@ limitations under the License. */
namespace paddle {
namespace operators {
static int FindOutIdx(int row, const std::vector<int>& height_sections) {
int offset = 0;
for (size_t i = 0; i < height_sections.size(); ++i) {
if (row >= offset && row < (offset + height_sections[i])) {
return i;
static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
for (size_t i = 1; i < abs_sections.size(); ++i) {
if (row < abs_sections[i]) {
return i - 1;
}
offset += height_sections[i];
}
return -1;
return abs_sections.size() - 1;
}
static std::vector<int> ToAbsoluteSection(
const std::vector<int>& height_sections) {
std::vector<int> abs_sections;
abs_sections.resize(height_sections.size());
abs_sections[0] = 0;
for (size_t i = 1; i < height_sections.size(); ++i) {
abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
}
return abs_sections;
}
template <typename DeviceContext, typename T>
......@@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
auto abs_sections = ToAbsoluteSection(height_sections);
auto x_rows = x->rows();
std::vector<std::vector<int>> outs_rows_idx;
std::vector<std::vector<int>> outs_dense_idx;
outs_rows_idx.resize(outs.size());
outs_dense_idx.resize(outs.size());
auto row_numel = x->value().numel() / x->value().dims()[0];
auto src = x->value().data<T>();
// split rows index into output sparse vars
for (size_t i = 0; i < x_rows.size(); ++i) {
int out_idx = FindOutIdx(x_rows[i], height_sections);
outs_rows_idx[out_idx].push_back(i);
int out_idx = FindOutIdx(x_rows[i], abs_sections);
outs_rows_idx[out_idx].push_back(x_rows[i]);
outs_dense_idx[out_idx].push_back(i);
}
auto place = ctx.GetPlace();
......@@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
dims[0] = rows_idx.size();
outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
for (auto idx : rows_idx) {
outs[i]->mutable_rows()->push_back(x_rows[idx]);
outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
}
auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
for (size_t j = 0; j < rows_idx.size(); j++) {
if (platform::is_cpu_place(place)) {
memory::Copy(platform::CPUPlace(), dst + j * row_numel,
platform::CPUPlace(), src + rows_idx[j] * row_numel,
sizeof(T) * row_numel);
memory::Copy(
platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
} else {
#ifdef PADDLE_WITH_CUDA
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
platform::CUDAPlace(), src + rows_idx[j] * row_numel,
platform::CUDAPlace(),
src + outs_dense_idx[i][j] * row_numel,
sizeof(T) * row_numel, stream);
#else
PADDLE_THROW("Paddle is not compiled with GPU");
......
......@@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel {
static_cast<framework::proto::VarType::Type>(dtype),
ctx.device_context());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(
x_vars[0]->Get<framework::SelectedRows>().value().type()),
ctx.device_context());
for (auto& var : x_vars) {
auto& value = var->Get<framework::SelectedRows>().value();
if (value.IsInitialized()) {
return framework::OpKernelType(framework::ToDataType(value.type()),
ctx.device_context());
}
}
// if input sparse vars are not initialized, use an default kernel type.
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
for (auto& x_var : x_vars) {
auto& array = x_var->Get<framework::LoDTensorArray>();
......
......@@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> {
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor;
......@@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
if (sel_row.rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
......
......@@ -48,7 +48,6 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
......
......@@ -127,6 +127,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device);
compute_capability = GetCUDAComputeCapability(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
......@@ -162,6 +163,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaGetLastError());
}
int CUDADeviceContext::GetComputeCapability() const {
return compute_capability;
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
......
......@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return compute capability in the device context. */
int GetComputeCapability() const;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
......@@ -104,6 +107,7 @@ class CUDADeviceContext : public DeviceContext {
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_;
int compute_capability;
int multi_process;
int max_threads_per_mp;
};
......
......@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
__macro(cublasDgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \
......@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
......
......@@ -33,6 +33,15 @@ int GetCUDADeviceCount() {
return count;
}
int GetCUDAComputeCapability(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
cudaDeviceProp device_prop;
PADDLE_ENFORCE(cudaGetDeviceProperties(&device_prop, id),
"cudaGetDeviceProperties failed in "
"paddle::platform::GetCUDAComputeCapability");
return device_prop.major * 10 + device_prop.minor;
}
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;
......
......@@ -30,6 +30,9 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system.
int GetCUDADeviceCount();
//! Get the compute capability of the ith GPU (format: major * 10 + minor)
int GetCUDAComputeCapability(int i);
//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/device_vector.h>
#include <memory>
#include <vector>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/init.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
static int dev_count = 0;
namespace paddle {
namespace platform {
TEST(NCCL, init) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
template <typename T>
struct PerThreadData {
thrust::device_vector<T> send_buff;
thrust::device_vector<T> recv_buff;
CUDADeviceContext dev_ctx;
T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }
T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
send_buff.resize(size);
for (size_t i = 0; i < size; ++i) {
send_buff[i] = static_cast<T>(i);
}
recv_buff.resize(size);
}
};
static constexpr int ELEM_COUNT = 10000;
TEST(NCCL, all_reduce) {
std::vector<ncclComm_t> comms;
comms.resize(dev_count);
VLOG(1) << "Initializing ncclComm";
dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
VLOG(1) << "ncclComm initialized";
VLOG(1) << "Creating thread data";
std::vector<std::unique_ptr<PerThreadData<double>>> data;
data.reserve(dev_count);
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Creating thread data for device " << i;
SetDeviceId(i);
data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
}
VLOG(1) << "Thread data created";
VLOG(1) << "Check send_buf data";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Check on device " << i;
SetDeviceId(i);
thrust::host_vector<double> tmp = data[i]->send_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
}
}
VLOG(1) << "Invoking ncclAllReduce";
dynload::ncclGroupStart();
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Invoking ncclAllReduce with device " << i;
SetDeviceId(i);
PADDLE_ENFORCE(dynload::ncclAllReduce(
data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
ncclSum, comms[i], data[i]->dev_ctx.stream()));
VLOG(1) << "Invoked ncclAllReduce for device " << i;
}
dynload::ncclGroupEnd();
VLOG(1) << "Invoked ncclAllReduce";
VLOG(1) << "Sync devices";
for (int i = 0; i < dev_count; ++i) {
VLOG(1) << "Sync device " << i;
SetDeviceId(i);
data[i]->dev_ctx.Wait();
}
VLOG(1) << "device synced";
for (int i = 0; i < dev_count; ++i) {
SetDeviceId(i);
VLOG(1) << "Checking vector on device " << i;
thrust::host_vector<double> tmp = data[i]->recv_buff;
for (size_t j = 0; j < tmp.size(); ++j) {
auto elem = static_cast<double>(j);
elem *= dev_count;
ASSERT_NEAR(tmp[j], elem, 1e-4);
}
}
for (int i = 0; i < dev_count; ++i) {
dynload::ncclCommDestroy(comms[i]);
}
}
} // namespace platform
} // namespace paddle
int main(int argc, char** argv) {
dev_count = paddle::platform::GetCUDADeviceCount();
if (dev_count <= 1) {
LOG(WARNING)
<< "Cannot test multi-gpu nccl, because the CUDA device count is "
<< dev_count;
return 0;
}
std::vector<paddle::platform::Place> places;
places.emplace_back(paddle::platform::CPUPlace());
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(paddle::platform::CUDAPlace(i));
}
VLOG(0) << " DeviceCount " << count;
paddle::platform::DeviceContextPool::Init(places);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
......@@ -161,6 +161,8 @@ void BindBlockDesc(py::module &m) {
py::return_value_policy::reference)
.def("prepend_op", &BlockDesc::PrependOp,
py::return_value_policy::reference)
.def("insert_op", &BlockDesc::InsertOp,
py::return_value_policy::reference)
.def("remove_op", &BlockDesc::RemoveOp)
.def("var",
[](BlockDesc &self, py::bytes byte_name) {
......
......@@ -37,7 +37,7 @@ from distribute_transpiler_simple import SimpleDistributeTranspiler
from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close)
import clip
from memory_optimization_transpiler import memory_optimize
from memory_optimization_transpiler import memory_optimize, release_memory
import profiler
import unique_name
......@@ -63,6 +63,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'memory_optimize',
'release_memory',
'profiler',
'unique_name',
]
......
......@@ -457,7 +457,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
"Out": [_append_grad_suffix_(loss.name)]
}, {"shape": [1],
"value": 1.0,
"dtype": loss.dtype})
"dtype": loss.dtype,
"force_cpu": False})
root_block.desc.append_op().copy_from(op_desc)
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
......
......@@ -102,6 +102,9 @@ def save_vars(executor,
save_var_map = {}
for each_var in vars:
# NOTE: don't save the variable which type is RAW
if each_var.type == core.VarDesc.VarType.RAW:
continue
new_var = _clone_var_in_block_(save_block, each_var)
if filename is None:
save_block.append_op(
......
......@@ -130,8 +130,13 @@ def detection_output(loc,
target_box=loc,
code_type='decode_center_size')
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
old_shape = scores.shape
scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
scores = ops.softmax(x=scores)
scores = ops.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1])
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op(
type="multiclass_nms",
inputs={'Scores': scores,
......@@ -562,16 +567,16 @@ def multi_box_head(inputs,
base_size,
num_classes,
aspect_ratios,
min_ratio,
max_ratio,
min_ratio=None,
max_ratio=None,
min_sizes=None,
max_sizes=None,
steps=None,
step_w=None,
step_h=None,
offset=0.5,
variance=[0.1, 0.1, 0.1, 0.1],
flip=False,
variance=[0.1, 0.1, 0.2, 0.2],
flip=True,
clip=False,
kernel_size=1,
pad=0,
......@@ -614,7 +619,7 @@ def multi_box_head(inputs,
the inputs[i] will be automatically calculated. Default: None.
offset(float): Prior boxes center offset. Default: 0.5
variance(list|tuple): the variances to be encoded in prior boxes.
Default:[0.1, 0.1, 0.1, 0.1].
Default:[0.1, 0.1, 0.2, 0.2].
flip(bool): Whether to flip aspect ratios. Default:False.
clip(bool): Whether to clip out-of-boundary boxes. Default: False.
kernel_size(int): The kernel size of conv2d. Default: 1.
......@@ -668,6 +673,19 @@ def multi_box_head(inputs,
helper = LayerHelper("prior_box", **locals())
dtype = helper.input_dtype()
attrs = {
'min_sizes': min_sizes,
'aspect_ratios': aspect_ratios,
'variances': variance,
'flip': flip,
'clip': clip,
'step_w': step_w,
'step_h': step_h,
'offset': offset
}
if len(max_sizes) > 0 and max_sizes[0] > 0:
attrs['max_sizes'] = max_sizes
box = helper.create_tmp_variable(dtype)
var = helper.create_tmp_variable(dtype)
helper.append_op(
......@@ -676,17 +694,7 @@ def multi_box_head(inputs,
"Image": image},
outputs={"Boxes": box,
"Variances": var},
attrs={
'min_sizes': min_sizes,
'max_sizes': max_sizes,
'aspect_ratios': aspect_ratios,
'variances': variance,
'flip': flip,
'clip': clip,
'step_w': step_w,
'step_h': step_h,
'offset': offset
})
attrs=attrs, )
return box, var
def _reshape_with_axis_(input, axis=1):
......@@ -714,7 +722,7 @@ def multi_box_head(inputs,
if num_layer <= 2:
assert min_sizes is not None and max_sizes is not None
assert len(min_sizes) == num_layer and len(max_sizes) == num_layer
else:
elif min_sizes is None and max_sizes is None:
min_sizes = []
max_sizes = []
step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
......@@ -759,9 +767,6 @@ def multi_box_head(inputs,
min_size = [min_size]
if not _is_list_or_tuple_(max_size):
max_size = [max_size]
if not (len(max_size) == len(min_size)):
raise ValueError(
'the length of max_size and min_size should be equal.')
aspect_ratio = []
if aspect_ratios is not None:
......@@ -779,7 +784,7 @@ def multi_box_head(inputs,
num_boxes = box.shape[2]
# get box_loc
# get loc
num_loc_output = num_boxes * 4
mbox_loc = nn.conv2d(
input=input,
......@@ -796,7 +801,7 @@ def multi_box_head(inputs,
mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
mbox_locs.append(mbox_loc_flatten)
# get conf_loc
# get conf
num_conf_output = num_boxes * num_classes
conf_loc = nn.conv2d(
input=input,
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册