Commit 740e1626 (机器未来 / Paddle, forked from PaddlePaddle/Paddle)
Authored Dec 17, 2018 by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/refine_w2v

test=develop

Parents: 7b10bf0e 3628d894

Showing 110 changed files with 5,468 additions and 1,125 deletions (+5468, -1125)

README.md  +81 -0
benchmark/fluid/fluid_benchmark.py  +3 -1
cmake/external/brpc.cmake  +12 -8
cmake/external/gtest.cmake  +7 -3
cmake/external/leveldb.cmake  +2 -2
paddle/fluid/API.spec  +3 -0
paddle/fluid/framework/CMakeLists.txt  +10 -3
paddle/fluid/framework/data_type_test.cc  +1 -1
paddle/fluid/framework/details/CMakeLists.txt  +14 -10
paddle/fluid/framework/details/computation_op_handle.cc  +4 -2
paddle/fluid/framework/details/computation_op_handle.h  +5 -1
paddle/fluid/framework/details/eager_deletion_op_handle.cc  +122 -0
paddle/fluid/framework/details/eager_deletion_op_handle.h  +58 -0
paddle/fluid/framework/details/eager_deletion_pass.cc  +101 -0
paddle/fluid/framework/details/eager_deletion_pass.h  +32 -0
paddle/fluid/framework/details/multi_devices_graph_pass.cc  +3 -3
paddle/fluid/framework/details/op_graph_view.cc  +3 -0
paddle/fluid/framework/details/op_graph_view.h  +28 -1
paddle/fluid/framework/details/reference_count_op_handle.h  +0 -138
paddle/fluid/framework/details/reference_count_pass.cc  +199 -147
paddle/fluid/framework/details/reference_count_pass.h  +0 -5
paddle/fluid/framework/details/reference_count_pass_helper.cc  +21 -0
paddle/fluid/framework/details/reference_count_pass_helper.h  +51 -0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc  +0 -18
paddle/fluid/framework/executor.cc  +99 -44
paddle/fluid/framework/executor.h  +13 -40
paddle/fluid/framework/executor_thread_worker.cc  +3 -0
paddle/fluid/framework/garbage_collector.cc  +89 -0
paddle/fluid/framework/garbage_collector.h  +59 -92
paddle/fluid/framework/ir/CMakeLists.txt  +2 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc  +106 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc  +105 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h  +33 -0
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc  +104 -0
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h  +33 -0
paddle/fluid/framework/ir/graph.h  +9 -2
paddle/fluid/framework/ir/graph_pattern_detector.cc  +112 -1
paddle/fluid/framework/ir/graph_pattern_detector.h  +45 -0
paddle/fluid/framework/ir/pass.h  +9 -2
paddle/fluid/framework/op_kernel_type_test.cc  +2 -1
paddle/fluid/framework/operator.cc  +2 -0
paddle/fluid/framework/parallel_executor.cc  +100 -40
paddle/fluid/framework/parallel_executor.h  +1 -23
paddle/fluid/framework/scope.cc  +6 -0
paddle/fluid/framework/scope.h  +1 -0
paddle/fluid/framework/tensor.h  +5 -2
paddle/fluid/inference/api/analysis_predictor_tester.cc  +6 -1
paddle/fluid/inference/api/api_impl_tester.cc  +1 -1
paddle/fluid/inference/api/paddle_pass_builder.h  +4 -1
paddle/fluid/inference/io.cc  +1 -1
paddle/fluid/inference/tests/api/trt_models_tester.cc  +24 -1
paddle/fluid/operators/controlflow/CMakeLists.txt  +1 -1
paddle/fluid/operators/controlflow/while_op.cc  +28 -2
paddle/fluid/operators/conv_op.cc  +3 -1
paddle/fluid/operators/cudnn_lstm_op.cu.cc  +2 -0
paddle/fluid/operators/distributed/CMakeLists.txt  +19 -12
paddle/fluid/operators/distributed/brpc_client.cc  +313 -58
paddle/fluid/operators/distributed/brpc_client.h  +82 -17
paddle/fluid/operators/distributed/brpc_rdma_pool.cc  +84 -0
paddle/fluid/operators/distributed/brpc_rdma_pool.h  +56 -0
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc  +196 -0
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h  +49 -0
paddle/fluid/operators/distributed/brpc_serde_test.cc  +175 -0
paddle/fluid/operators/distributed/brpc_server.cc  +235 -29
paddle/fluid/operators/distributed/brpc_variable_response.cc  +73 -0
paddle/fluid/operators/distributed/brpc_variable_response.h  +67 -0
paddle/fluid/operators/distributed/grpc_client.cc  +1 -2
paddle/fluid/operators/distributed/grpc_serde.cc  +0 -7
paddle/fluid/operators/distributed/rpc_server.h  +4 -0
paddle/fluid/operators/distributed/sendrecvop_utils.cc  +1 -1
paddle/fluid/operators/distributed/sendrecvop_utils.h  +7 -0
paddle/fluid/operators/distributed_ops/CMakeLists.txt  +2 -2
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc  +4 -3
paddle/fluid/operators/distributed_ops/send_op.cc  +2 -0
paddle/fluid/operators/math/pooling.cc  +153 -62
paddle/fluid/operators/math/pooling.cu  +268 -147
paddle/fluid/operators/math/pooling.h  +22 -10
paddle/fluid/operators/pool_op.cc  +62 -3
paddle/fluid/operators/pool_op.h  +10 -6
paddle/fluid/operators/pool_with_index_op.cc  +35 -3
paddle/fluid/operators/pool_with_index_op.h  +8 -4
paddle/fluid/operators/psroi_pool_op.cc  +171 -0
paddle/fluid/operators/psroi_pool_op.cu  +294 -0
paddle/fluid/operators/psroi_pool_op.h  +253 -0
paddle/fluid/operators/reader/ctr_reader.h  +6 -6
paddle/fluid/operators/spp_op.h  +3 -3
paddle/fluid/platform/CMakeLists.txt  +8 -1
paddle/fluid/platform/device_context.cc  +1 -0
paddle/fluid/platform/device_context.h  +2 -8
paddle/fluid/platform/stream_callback_manager.cc  +63 -0
paddle/fluid/platform/stream_callback_manager.h  +13 -48
paddle/fluid/pybind/pybind.cc  +9 -0
paddle/fluid/pybind/tensor_py.h  +6 -6
paddle/scripts/paddle_build.sh  +12 -0
python/paddle/fluid/__init__.py  +4 -3
python/paddle/fluid/layers/nn.py  +255 -0
python/paddle/fluid/tests/unittests/CMakeLists.txt  +7 -3
python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt  +6 -0
python/paddle/fluid/tests/unittests/ngraph/__init__.py  +13 -0
python/paddle/fluid/tests/unittests/test_dist_base.py  +12 -3
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py  +86 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py  +49 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py  +50 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py  +27 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py  +27 -0
python/paddle/fluid/tests/unittests/test_layers.py  +33 -0
python/paddle/fluid/tests/unittests/test_pool2d_op.py  +65 -26
python/paddle/fluid/tests/unittests/test_pool3d_op.py  +86 -35
python/paddle/fluid/tests/unittests/test_pool_max_op.py  +77 -18
python/paddle/fluid/tests/unittests/test_psroi_pool_op.py  +134 -0

README.md
@@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
+
+欢迎来到 PaddlePaddle GitHub
+
+PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。
+
+我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
+
+跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
+
 ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
...
@@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
+### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### 安装最新稳定版本:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==1.2.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==1.2.0.post85
+# 其他平台上的安装指引请参考 http://paddlepaddle.org/
+```
+
 ## Features
 - **Flexibility**
...
@@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85
   Baidu and it has achieved a significant impact. We hope you can also explore
   the capability of PaddlePaddle to make an impact on your product.
+## 特点
+- **灵活性**
+    PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
+- **高效性**
+    为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例:
+      - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
+      - 通过MKL-DNN库优化CNN网络
+      - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列
+      - 针对高维稀疏数据模型,优化了局部和分布式训练。
+- **稳定性**
+    有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
+- **连接产品**
+    另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。
+
 ## Installation
 It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
+## 安装
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
+
 ## Documentation
 We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
...
@@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte
   We appreciate your contributions!
+## 文档
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)文档
+- [深度学习101](https://github.com/PaddlePaddle/book):或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html):可以在MPI集群上运行分布式训练任务
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html):新的API支持代码更少更简洁的程序
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html):欢迎您的贡献!
+
 ## Ask Questions
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
+## 答疑
+欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
+
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
+## 版权和许可证
+PaddlePaddle由[Apache-2.0 license](LICENSE)提供

benchmark/fluid/fluid_benchmark.py
@@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
-    config = distribute_transpiler.DistributeTranspilerConfig()
+    config = fluid.DistributeTranspilerConfig()
     config.slice_var_up = not args.no_split_var
+    config.min_block_size = 1048576
     t = distribute_transpiler.DistributeTranspiler(config=config)
     t.transpile(trainer_id,
                 # NOTE: *MUST* use train_prog, for we are using with guard to
...

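The hunk above switches the benchmark to `fluid.DistributeTranspilerConfig` and sets `min_block_size`. For readers unfamiliar with that API, the sketch below shows the usual end-to-end pattern around it. It is an illustrative sketch, not code from this commit: the endpoint strings, environment-variable defaults, and the tiny regression model are invented for the example.

```python
import os
import paddle.fluid as fluid

# Minimal model so the program actually contains optimizer ops to transpile.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

# Placeholder cluster description; real jobs read these from the environment.
pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171"
current_endpoint = "127.0.0.1:6170"
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")

config = fluid.DistributeTranspilerConfig()
config.slice_var_up = True          # split large variables across pservers
config.min_block_size = 1048576     # same threshold the benchmark sets above

t = fluid.DistributeTranspiler(config=config)
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

if training_role == "PSERVER":
    # Each parameter server runs its own transpiled program.
    pserver_prog = t.get_pserver_program(current_endpoint)
    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
else:
    # Trainers run the rewritten trainer program.
    trainer_prog = t.get_trainer_program()
```
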
cmake/external/brpc.cmake
@@ -14,14 +14,16 @@
 INCLUDE(ExternalProject)
-find_library(SSL_LIBRARY NAMES ssl)
+find_package(OpenSSL REQUIRED)
+
+message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
+message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY})
+
 ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
 
-find_library(CRYPTO_LIBRARY NAMES crypto)
 ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
 
 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
...
@@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
     extern_brpc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
     GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
-    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
+    GIT_TAG         "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
     PREFIX          ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
...
@@ -50,7 +53,7 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_PREFIX_PATH=${prefix_path}
-                    -DBRPC_WITH_GLOG=ON
+                    -DWITH_GLOG=ON
                     -DIOBUF_WITH_HUGE_BLOCK=ON
                     -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
                     ${EXTERNAL_OPTIONAL_ARGS}
...
@@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
+add_definitions(-DBRPC_WITH_GLOG)
 
 LIST(APPEND external_project_dependencies brpc)

cmake/external/gtest.cmake
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-IF(WITH_TESTING)
+#FIXME:(gongwb) Move brpc's gtest dependency.
+IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
+  IF(WITH_TESTING)
     ENABLE_TESTING()
+  ENDIF(WITH_TESTING)
   INCLUDE(ExternalProject)
 
   SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
...
@@ -76,4 +80,4 @@ IF(WITH_TESTING)
   ADD_DEPENDENCIES(gtest_main extern_gtest)
 
   LIST(APPEND external_project_dependencies gtest gtest_main)
-ENDIF(WITH_TESTING)
+ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))

cmake/external/leveldb.cmake
@@ -24,8 +24,8 @@ ExternalProject_Add(
     extern_leveldb
     ${EXTERNAL_PROJECT_LOG_ARGS}
     PREFIX ${LEVELDB_SOURCES_DIR}
-    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
-    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    GIT_REPOSITORY "https://github.com/google/leveldb"
+    GIT_TAG v1.18
     CONFIGURE_COMMAND ""
     BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
     INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
...

paddle/fluid/API.spec
@@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
+paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
@@ -198,6 +200,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...

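The two `adaptive_pool*` entries and the `psroi_pool` entry above are the new Python layers this merge registers. The sketch below is a hedged usage illustration based only on the ArgSpecs listed here: the tensor names and shapes are invented, and `psroi_pool` is assumed to follow the usual PSRoI constraint that the input channel count equals `output_channels * pooled_height * pooled_width`.

```python
import paddle.fluid as fluid

# Illustrative inputs only; names and shapes are not taken from the commit.
img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
feat = fluid.layers.data(name='feat', shape=[490, 28, 28], dtype='float32')
rois = fluid.layers.data(name='rois', shape=[4], dtype='float32', lod_level=1)

# adaptive_pool2d pools to a fixed output size regardless of the input's
# spatial size (ArgSpec: input, pool_size, pool_type='max', require_index=False).
pooled = fluid.layers.adaptive_pool2d(input=img, pool_size=[7, 7], pool_type='max')

# psroi_pool is position-sensitive RoI pooling; 490 input channels are chosen
# here so that output_channels (10) * pooled_height (7) * pooled_width (7) = 490.
ps = fluid.layers.psroi_pool(input=feat, rois=rois, output_channels=10,
                             spatial_scale=1.0, pooled_height=7, pooled_width=7)
```
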
paddle/fluid/framework/CMakeLists.txt
@@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
...
@@ -167,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
+    lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(WITH_NGRAPH)
     if(NOT WIN32)
...
@@ -183,6 +188,8 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
+
+target_link_libraries(executor garbage_collector)
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
   threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
   graph build_strategy
...

paddle/fluid/framework/data_type_test.cc
@@ -35,6 +35,6 @@ TEST(DataType, float16) {
   EXPECT_EQ(f::SizeOfType(dtype), 2u);
 
   // test debug info
-  std::string type = "float16";
+  std::string type = "::paddle::platform::float16";
   EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
 }

paddle/fluid/framework/details/CMakeLists.txt
@@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
+if(WITH_DISTRIBUTE)
+  if(NOT WITH_GRPC)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  endif()
+endif()
+
 if(WITH_GPU)
   nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
     dynload_cuda variable_visitor)
   if(WITH_DISTRIBUTE)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-      ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+      ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
   else()
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
       ddim dynload_cuda selected_rows_functor)
...
@@ -30,7 +37,7 @@ else()
     variable_visitor)
   if(WITH_DISTRIBUTE)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-      ddim selected_rows_functor sendrecvop_grpc)
+      ddim selected_rows_functor sendrecvop_rpc)
   else()
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
       ddim selected_rows_functor)
...
@@ -45,10 +52,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 
-if(WITH_GPU)
-  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-    all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
-endif()
+cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
...
@@ -56,10 +63,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
   scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
-if(WITH_GPU)
-  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
-endif()
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
...

paddle/fluid/framework/details/computation_op_handle.cc
@@ -20,11 +20,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place)
+                                         platform::Place place,
+                                         size_t scope_idx)
     : OpHandleBase(node),
       op_(framework::OpRegistry::CreateOp(*node->Op())),
       scope_(scope),
-      place_(place) {}
+      place_(place),
+      scope_idx_(scope_idx) {}
 
 void ComputationOpHandle::RunImpl() {
   WaitInputVarGenerated(place_);
...

paddle/fluid/framework/details/computation_op_handle.h
@@ -28,7 +28,8 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
  public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
+                      size_t scope_idx);
 
   std::string Name() const override;
...
@@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {
   void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
 
+  size_t GetScopeIdx() const { return scope_idx_; }
+
  protected:
   void RunImpl() override;
...
@@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
   platform::Place place_;
+  size_t scope_idx_;
   bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
...

paddle/fluid/framework/details/eager_deletion_op_handle.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

namespace paddle {
namespace framework {
namespace details {

EagerDeletionOpHandle::EagerDeletionOpHandle(
    ir::Node *node, const Scope *scope, const platform::Place &place,
    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
    AtomicReferenceCountMap *ref_cnts)
    : OpHandleBase(node),
      scope_(scope),
      var_names_(var_names),
      gc_(gc),
      ref_cnts_(ref_cnts) {
#ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place)) {
    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
      platform::CUDADeviceGuard guard(
          boost::get<platform::CUDAPlace>(place).device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
      PADDLE_ENFORCE_NOT_NULL(event_);
    }
  }
#endif
}

EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#ifdef PADDLE_WITH_CUDA
  if (event_) {
    auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
    platform::CUDADeviceGuard guard(gpu_place.device);
    PADDLE_ENFORCE(cudaEventDestroy(event_));
  }
#endif
}

std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }

void EagerDeletionOpHandle::RunImpl() {
  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
  std::deque<std::shared_ptr<memory::Allocation>> garbages;
  for (auto &name : var_names_) {
    auto it = ref_cnts_->find(name);
    // Var not found, or reference count has not decreased to 0
    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
      continue;
    }

    auto *var = exec_scope->FindVar(name);
    if (var == nullptr) {
      continue;
    }

    VLOG(2) << "Erase variable " << name;

    if (var->IsType<LoDTensor>()) {
      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
    } else if (var->IsType<SelectedRows>()) {
      garbages.emplace_back(
          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
    } else if (var->IsType<LoDTensorArray>()) {
      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto &t : *tensor_arr) {
        garbages.emplace_back(t.MoveMemoryHolder());
      }
    } else {
      PADDLE_THROW("Type %s of %s is not supported eager deletion",
                   var->Type().name(), name);
    }
  }

  if (!garbages.empty()) {
    ClearGarbages(&garbages);
  }
}

void EagerDeletionOpHandle::ClearGarbages(
    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
#ifdef PADDLE_WITH_CUDA
  if (event_) {
    auto compute_stream = dev_ctx_->stream();
    auto callback_stream =
        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
    auto callback_func = [=]() {
      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
    };
    gc_->Add(std::move(*garbages), callback_func);
  } else {
#endif
    gc_->Add(std::move(*garbages));
#ifdef PADDLE_WITH_CUDA
  }
#endif
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
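`EagerDeletionOpHandle::RunImpl` above frees a variable only once its atomic reference count drops to zero, and then hands the freed buffers to a garbage collector. The following is a minimal Python sketch of that bookkeeping idea, with invented variable names and without the CUDA stream handling; it is illustrative only and not part of the commit.

```python
# Hypothetical reference counts: how many downstream ops still need each variable.
ref_cnts = {"hidden_0": 2, "hidden_1": 1}
garbage = []  # stands in for the garbage collector's queue


def eager_delete(var_name, scope):
    """Decrement the count for var_name; collect its buffer once it hits zero."""
    remaining = ref_cnts.get(var_name)
    if remaining is None:
        return
    ref_cnts[var_name] = remaining - 1
    if ref_cnts[var_name] == 0:
        buf = scope.pop(var_name, None)  # analogous to MoveMemoryHolder()
        if buf is not None:
            garbage.append(buf)


scope = {"hidden_0": bytearray(1024), "hidden_1": bytearray(2048)}
eager_delete("hidden_1", scope)  # count drops to 0, buffer moves to the garbage queue
eager_delete("hidden_0", scope)  # count drops to 1, nothing is freed yet
```
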
paddle/fluid/framework/details/eager_deletion_op_handle.h (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include <deque>
#include <string>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"

namespace paddle {
namespace framework {
class Scope;

namespace details {

class EagerDeletionOpHandle : public OpHandleBase {
 public:
  EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
                        const platform::Place &place,
                        const std::unordered_set<std::string> &var_names,
                        GarbageCollector *gc,
                        AtomicReferenceCountMap *ref_cnts);

  ~EagerDeletionOpHandle();

  std::string Name() const override;

 protected:
  void RunImpl() override;

 private:
  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);

  const Scope *scope_;
  std::unordered_set<std::string> var_names_;
  GarbageCollector *gc_;               // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own

#ifdef PADDLE_WITH_CUDA
  platform::CUDADeviceContext *dev_ctx_{nullptr};
  cudaEvent_t event_{nullptr};
#endif
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/eager_deletion_pass.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#include <queue>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"

namespace paddle {
namespace framework {
namespace details {

std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts =
      Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
  PADDLE_ENFORCE(ref_cnts.empty(),
                 "kRuntimeReferenceCount should be initialized here!");

  const auto &vars = graph->Get<GraphVars>(kGraphVars);
  ref_cnts.resize(vars.size());

  const auto &last_live_ops =
      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
  const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
  const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);

  // a reverse map of last_live_ops
  //   i.e., last op --> variable names which can be deleted.
  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
      op_vars_map;

  for (auto &var_ops_map : last_live_ops) {
    for (auto &var_ops_pair : var_ops_map) {
      const std::string &var_name = var_ops_pair.first;
      for (auto *op : var_ops_pair.second) {
        op_vars_map[op].insert(var_name);
      }
    }
  }

  for (auto &pair : op_vars_map) {
    auto *op = pair.first;
    auto &var_names = pair.second;

    auto *eager_deletion_node =
        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
    auto *eager_deletion_op = new EagerDeletionOpHandle(
        eager_deletion_node, op->GetScope(), op->GetPlace(), var_names,
        gcs.at(places[op->GetScopeIdx()]).get(),
        &(ref_cnts[op->GetScopeIdx()]));

    auto it = std::find_if(
        op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) {
          return dynamic_cast<DummyVarHandle *>(var) != nullptr;
        });

    if (it != op->Outputs().end()) {
      eager_deletion_op->AddInput(*it);
    } else {
      auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
      op->AddOutput(dep_var);
      eager_deletion_op->AddInput(dep_var);
    }

    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
    eager_deletion_op->AddOutput(dummy_leaf);
  }

  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(eager_deletion_pass,
              paddle::framework::details::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
    .RequirePassAttr(paddle::framework::details::kAllPlaces)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);

paddle/fluid/framework/details/eager_deletion_pass.h (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace details {

class EagerDeletionPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -565,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     int dev_id) const {
   result->Get<GraphOps>(kGraphOps).emplace_back(
       new ComputationOpHandle(result->CreateOpNode(node->Op()),
-                              local_scopes_[dev_id], places_[dev_id]));
+                              local_scopes_[dev_id], places_[dev_id], dev_id));
   CreateOpHandleIOs(result, node, dev_id);
 }
...
@@ -688,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
+        result->CreateOpNode(node->Op()), s, p, scope_idx));
     CreateOpHandleIOs(result, node, scope_idx);
   }
 }
...

paddle/fluid/framework/details/op_graph_view.cc
@@ -23,6 +23,8 @@ namespace details {
 OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }
 
 void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
+  preceding_ops_.clear();
+  pending_ops_.clear();
   for (auto &op : ops) {
     preceding_ops_[op];
     pending_ops_[op];
...
@@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
   std::unordered_set<OpHandleBase *> ret;
+  ret.reserve(preceding_ops_.size());
   for (auto &pair : preceding_ops_) {
     ret.insert(pair.first);
   }
...

paddle/fluid/framework/details/op_graph_view.h
@@ -14,7 +14,7 @@
 #pragma once
 
-#include <memory>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
...
@@ -34,6 +34,11 @@ class OpGraphView {
   bool HasOp(OpHandleBase *op) const;
 
+  // Use a visitor to visit all pending ops of op
+  // Stop when callback returns false
+  template <typename Callback>
+  bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
+
  private:
   void Build(const std::vector<OpHandleBase *> &ops);
   void EnforceHasOp(OpHandleBase *op) const;
...
@@ -44,6 +49,28 @@ class OpGraphView {
       pending_ops_;
 };
 
+template <typename Callback>
+bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
+                                     Callback &&callback) const {
+  EnforceHasOp(op);
+  std::unordered_set<OpHandleBase *> visited;
+  std::queue<OpHandleBase *> q;
+  q.push(op);
+  do {
+    op = q.front();
+    q.pop();
+    for (auto &pending_op : pending_ops_.at(op)) {
+      if (visited.count(pending_op) == 0) {
+        visited.insert(pending_op);
+        if (!callback(pending_op)) {
+          return false;
+        }
+      }
+    }
+  } while (!q.empty());
+  return true;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle

paddle/fluid/framework/details/reference_count_op_handle.h (deleted, 100644 → 0; previous content at 7b10bf0e)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace framework {
namespace details {

using ReferenceCountMap = std::unordered_map<std::string, int>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<int>>;
using DeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
using AtomicDeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
using DeviceGarbageCollectorMap =
    std::unordered_map<int,
                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;

class ReferenceCountOpHandle : public OpHandleBase {
 public:
  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
                         const platform::CUDAPlace &place,
                         const std::vector<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts)
      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (IsStreamGarabageCollector()) {
      platform::SetDeviceId(place.device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
    }

    for (auto &name : var_names) AddVar(name);
  }

  ~ReferenceCountOpHandle() {
    if (IsStreamGarabageCollector()) {
      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
      platform::SetDeviceId(gpu_place.device);
      PADDLE_ENFORCE(cudaEventDestroy(event_));
    }
  }

  std::string Name() const override { return "reference_count"; }

  void AddVar(const std::string &name) {
    auto it = var_names_.find(name);
    if (it != var_names_.end())
      ++(it->second);
    else
      var_names_[name] = 1;
  }

 protected:
  void RunImpl() override {
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
    std::vector<Tensor *> tensors;
    for (auto &pair : var_names_) {
      auto &name = pair.first;
      auto it = ref_cnts_->find(name);
      if (it == ref_cnts_->end()) continue;

      auto *var = exec_scope->FindVar(name);
      if (var == nullptr) continue;

      if (var->IsType<LoDTensor>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(var->GetMutable<LoDTensor>());
        }
      } else if (var->IsType<SelectedRows>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(
              var->GetMutable<SelectedRows>()->mutable_value());
        }
      }
    }

    if (!tensors.empty()) {
      ClearTensors(tensors);
    }
  }

 private:
  void ClearTensors(const std::vector<Tensor *> &tensors) {
    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
  }

  bool IsStreamGarabageCollector() const {
    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
  }

  const Scope *scope_;
  platform::CUDADeviceContext *dev_ctx_;
  std::unordered_map<std::string, int> var_names_;
  GarbageCollector<Tensor> *gc_;       // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own
  cudaEvent_t event_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
740e1626
...
...
@@ -14,187 +14,240 @@
#include <queue>
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/reference_count_pass.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
ComputationOpHandle
*
FindNextComputationOpHandle
(
VarHandle
*
var_in
)
{
std
::
queue
<
VarHandleBase
*>
queue
;
queue
.
push
(
var_in
);
// A functor to shrink/remove operators who depend on other operators in a set
class
ShrinkDepsOpFunctor
{
private:
enum
RelationShip
{
kSame
=
0
,
kNoDeps
=
1
,
kBefore
=
2
,
kAfter
=
3
};
public:
explicit
ShrinkDepsOpFunctor
(
const
std
::
vector
<
OpHandleBase
*>
&
all_ops
)
:
graph_
(
all_ops
)
{}
template
<
typename
OpSet
>
OpSet
operator
()(
const
OpSet
&
op_set
)
const
{
using
KeyType
=
typename
OpSet
::
key_type
;
static_assert
(
std
::
is_base_of
<
OpHandleBase
,
typename
std
::
remove_pointer
<
KeyType
>::
type
>::
value
,
"Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"
);
if
(
op_set
.
size
()
<=
1
)
return
op_set
;
std
::
vector
<
OpHandleBase
*>
ops
(
op_set
.
begin
(),
op_set
.
end
());
OpSet
ret
;
auto
rels
=
GetRelations
(
ops
);
auto
not_before
=
[](
RelationShip
r
)
{
return
r
!=
kBefore
;
};
for
(
size_t
i
=
0
;
i
<
rels
.
size
();
++
i
)
{
if
(
std
::
all_of
(
rels
[
i
].
begin
(),
rels
[
i
].
end
(),
not_before
))
{
ret
.
emplace
(
static_cast
<
KeyType
>
(
ops
[
i
]));
}
}
return
ret
;
}
private:
std
::
vector
<
std
::
vector
<
RelationShip
>>
GetRelations
(
const
std
::
vector
<
OpHandleBase
*>
&
ops
)
const
{
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
op_to_idx
;
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
graph_
.
HasOp
(
ops
[
i
]),
"Op does not exist in graph"
);
op_to_idx
[
ops
[
i
]]
=
i
;
}
PADDLE_ENFORCE
(
op_to_idx
.
size
()
==
ops
.
size
(),
"Duplicate ops"
);
std
::
vector
<
std
::
vector
<
RelationShip
>>
ret
(
ops
.
size
());
for
(
auto
&
e
:
ret
)
{
e
.
assign
(
ops
.
size
(),
kSame
);
}
size_t
found_num
=
ops
.
size
();
size_t
total_num
=
ops
.
size
()
*
ops
.
size
();
auto
visitor
=
[
&
](
OpHandleBase
*
op
,
size_t
i
)
{
auto
it
=
op_to_idx
.
find
(
op
);
if
(
it
!=
op_to_idx
.
end
())
{
size_t
j
=
it
->
second
;
if
(
i
!=
j
&&
ret
[
i
][
j
]
==
kSame
)
{
ret
[
i
][
j
]
=
kBefore
;
ret
[
j
][
i
]
=
kAfter
;
found_num
+=
2
;
if
(
found_num
==
total_num
)
{
return
false
;
}
}
}
return
true
;
};
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
auto
sub_visitor
=
[
&
,
i
](
OpHandleBase
*
op
)
{
return
visitor
(
op
,
i
);
};
if
(
!
graph_
.
VisitAllPendingOps
(
ops
[
i
],
sub_visitor
))
{
break
;
}
}
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
ops
.
size
();
++
j
)
{
if
(
ret
[
i
][
j
]
!=
kSame
)
continue
;
ret
[
i
][
j
]
=
kNoDeps
;
ret
[
j
][
i
]
=
kNoDeps
;
}
}
return
ret
;
}
const
OpGraphView
graph_
;
};
/**
* Find the nearest downstream computation op handle. If the op is a
* computation op, just return itself.
*/
static
ComputationOpHandle
*
FindNextComputationOpHandleOrReturnItself
(
OpHandleBase
*
op
,
size_t
scope_idx
)
{
std
::
queue
<
OpHandleBase
*>
q
;
std
::
unordered_set
<
OpHandleBase
*>
visited
;
q
.
push
(
op
);
do
{
auto
*
var
=
queue
.
front
();
queue
.
pop
();
for
(
auto
*
op
:
var
->
PendingOps
())
{
auto
*
op
=
q
.
front
();
q
.
pop
();
auto
*
compute_op
=
dynamic_cast
<
ComputationOpHandle
*>
(
op
);
if
(
compute_op
!=
nullptr
&&
compute_op
->
GetPlace
()
==
var_in
->
place_
)
{
if
(
compute_op
!=
nullptr
&&
compute_op
->
GetScopeIdx
()
==
scope_idx
)
{
return
compute_op
;
}
for
(
auto
*
out_var
:
op
->
Outputs
())
{
queue
.
push
(
out_var
);
for
(
auto
*
pending_op
:
out_var
->
PendingOps
())
{
if
(
visited
.
count
(
pending_op
))
continue
;
visited
.
insert
(
pending_op
);
}
}
}
while
(
!
q
ueue
.
empty
());
}
while
(
!
q
.
empty
());
return
nullptr
;
}
static
void
AddDependencyBetween
(
OpHandleBase
*
in
,
OpHandleBase
*
out
,
ir
::
Graph
*
graph
)
{
auto
it
=
std
::
find_if
(
in
->
Outputs
().
begin
(),
in
->
Outputs
().
end
(),
[](
VarHandleBase
*
var
)
{
return
dynamic_cast
<
DummyVarHandle
*>
(
var
)
!=
nullptr
;
});
if
(
it
!=
in
->
Outputs
().
end
())
{
out
->
AddInput
(
*
it
);
static
std
::
unordered_set
<
ComputationOpHandle
*>
ExtractComputationOpFromLastLivedVar
(
VarHandle
*
var
,
size_t
scope_idx
,
const
ShrinkDepsOpFunctor
&
shrink_func
,
bool
*
ok
)
{
// stage one. Get last op for variable.
std
::
unordered_set
<
OpHandleBase
*>
candidates
;
{
if
(
var
->
PendingOps
().
empty
()
&&
var
->
GeneratedOp
())
{
// No operator depends on this variable. So the last operator is the op
// who generates this variable.
candidates
.
emplace
(
var
->
GeneratedOp
());
}
else
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
in
->
AddOutput
(
dep_var
);
out
->
AddInput
(
dep_var
);
candidates
=
var
->
PendingOps
();
}
// No pending ops or generated op is nullptr
if
(
candidates
.
empty
())
{
*
ok
=
false
;
return
{};
}
}
// stage two. Try to cast them to computation op.
// return (*ok=false) when failed.
//
// The reason why we cannot make any types of op handle to be the last lived
// op is:
// some op handle may operate on many DeviceContext, however, our garbage
// collector can only wait one DeviceContext for now. So currently, we wait
// the nearest compute op.
std
::
unordered_set
<
ComputationOpHandle
*>
computation_op
;
{
for
(
auto
*
op
:
candidates
)
{
auto
*
compute_op
=
FindNextComputationOpHandleOrReturnItself
(
op
,
scope_idx
);
if
(
compute_op
==
nullptr
)
{
*
ok
=
false
;
return
{};
}
computation_op
.
emplace
(
compute_op
);
}
}
// stage three. Try to shrink computation op if they depend on each other.
// Get the smallest set of the most ops.
*
ok
=
true
;
return
shrink_func
(
computation_op
);
}
static
VarDesc
*
TryGetLatestVarDesc
(
const
std
::
vector
<
VarHandle
*>
&
vars
)
{
VarDesc
*
var_desc
=
nullptr
;
std
::
find_if
(
vars
.
rbegin
(),
vars
.
rend
(),
[
&
](
VarHandle
*
var_handle
)
->
bool
{
var_desc
=
var_handle
->
Node
()
->
Var
();
return
var_desc
!=
nullptr
;
});
return
var_desc
;
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);

  // It is not easy to find the right reference counts of varaibles in graph
  // Step 1: Find all variables in computation ops
  // Step 2: Find all variables in non-computation ops which refers to variables
  //         in computation ops
  std::unordered_set<std::string> names;
  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
      compute_ref_cnt_map;

  auto get_ref_cnts_from_compute_op = [&](
      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
    std::vector<std::string> var_names_in_op;
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    if (compute_op == nullptr ||
        !platform::is_gpu_place(compute_op->GetPlace()))
      return var_names_in_op;
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      if (!platform::is_gpu_place(var_handle->place_) ||
          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
        continue;
  auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
  auto &last_live_ops_of_vars =
      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);

      VarDesc *var_desc = var_handle->Node()->Var();
      auto var_name = var_handle->Node()->Name();
  PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(),
                 "Last Live Ops and Reference Counts of vars should be "
                 "initialized at here.");

      // This is weird but there is really some variables without var_desc
      // in computation_op
      if (var_desc == nullptr) {
        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
        if (var_desc == nullptr) continue;
      }
  const auto &vars = graph->Get<GraphVars>(kGraphVars);

      if (var_desc->Persistable()) continue;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS) {
        continue;
      }
  last_live_ops_of_vars.resize(vars.size());
  ref_cnts.resize(vars.size());

      // compute op only runs in one device
      if (ref_cnts[place.device]->count(var_name))
        ++(*ref_cnts[place.device])[var_name];
      else
        (*ref_cnts[place.device])[var_name] = 1;
  ShrinkDepsOpFunctor shrink_func(
      ir::FilterByNodeWrapper<OpHandleBase>(*graph));

      names.insert(var_name);
      var_names_in_op.push_back(var_name);
    }
    return var_names_in_op;
  };

  for (size_t i = 0; i < vars.size(); ++i) {
    for (auto &name_var_pair : vars[i]) {
      // Whether this variable can be reused or deleted? If not, we do not
      // compute reference counts and dependencies.
      VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);

  auto update_ref_cnts_from_non_compute_op = [&](
      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      auto var_name = var_handle->Node()->Name();
      auto var_place = var_handle->place_;
      if (!platform::is_gpu_place(var_place)) continue;
      auto place = boost::get<platform::CUDAPlace>(var_place);
      if (names.count(var_name) == 0) continue;
      if (ref_cnts.count(place.device) &&
          ref_cnts[place.device]->count(var_name)) {
        ++(*ref_cnts[place.device])[var_name];

        auto *next_compute_op = FindNextComputationOpHandle(var_handle);
        if (next_compute_op != nullptr) {
          if (compute_ref_cnt_map.count(next_compute_op)) {
            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
            VLOG(5) << "Add reference count of " << var_name << " to Operator "
                    << next_compute_op->Name();
          } else {
            // Create new reference_count_op_handle
            ir::Node *ref_cnt_node = graph->CreateEmptyNode(
                "reference_count", ir::Node::Type::kOperation);
            auto *ref_cnt_handle = new ReferenceCountOpHandle(
                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
                gcs[place.device].get(), cur_ref_cnts[place.device].get());
            AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
          }
        }
      }
      if (var_desc == nullptr || var_desc->Persistable()) {
        continue;
      }
  };

  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
  for (auto &op : all_ops) {
    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
    if (in_var_names.empty() && out_var_names.empty()) continue;
    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
                        out_var_names.end());
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    ir::Node *ref_cnt_node =
        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
    auto *ref_cnt_handle = new ReferenceCountOpHandle(
        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
        gcs[place.device].get(), cur_ref_cnts[place.device].get());
    AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS &&
          var_type != proto::VarType::LOD_TENSOR_ARRAY) {
        // Var type cannot be deleted
        continue;
      }

  for (auto &op : all_ops) {
    update_ref_cnts_from_non_compute_op(op, op->Inputs());
    update_ref_cnts_from_non_compute_op(op, op->Outputs());
  }

      bool ok;
      auto result = ExtractComputationOpFromLastLivedVar(
          name_var_pair.second.back(), i, shrink_func, &ok);

  std::vector<OpHandleBase *> new_all_ops;
  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
  for (auto &op : all_ops) {
    new_all_ops.emplace_back(std::move(op));
    auto it = compute_ref_cnt_map.find(new_all_ops.back());
    if (it != compute_ref_cnt_map.end()) {
      // Add LeafNode to ReferenceCountOpHandle
      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
      it->second->AddOutput(dummy_leaf);
      new_all_ops.emplace_back(std::move(it->second));
      if (ok) {
        auto &var_name = name_var_pair.first;
        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
                       var_name);
        ref_cnts[i].emplace(var_name, result.size());
        last_live_ops_of_vars[i].emplace(var_name, std::move(result));
      }
    }
  }

  all_ops.swap(new_all_ops);
  return graph;
}
...
...
@@ -205,5 +258,4 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
REGISTER_PASS(reference_count_pass,
              paddle::framework::details::ReferenceCountPass)
    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars);
paddle/fluid/framework/details/reference_count_pass.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
...
...
@@ -22,10 +21,6 @@ namespace paddle {
namespace framework {
namespace details {

constexpr char kGlobalReferenceCount[] = "reference_count";
constexpr char kCurReferenceCount[] = "current_reference_count";
constexpr char kGarbageCollector[] = "garbage_collector";

class ReferenceCountPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
...
...
paddle/fluid/framework/details/reference_count_pass_helper.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
namespace paddle {
namespace framework {
namespace details {}  // namespace details
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/details/reference_count_pass_helper.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
namespace details {

class ComputationOpHandle;

using ReferenceCountMap = std::unordered_map<std::string, size_t>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<size_t>>;
using GarbageCollectorMap =
    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;

const char kGlobalReferenceCount[] = "global_reference_count";
const char kRuntimeReferenceCount[] = "runtime_reference_count";
const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";

using LastLiveOpsOfVars =
    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";

}  // namespace details
}  // namespace framework
}  // namespace paddle
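These aliases split the bookkeeping into a static map (computed once) and an atomic runtime map (reset and decremented every iteration). The following is a self-contained sketch of that split using only the standard library; it is illustrative, not Paddle code, and the variable and value names are made up.

  #include <atomic>
  #include <iostream>
  #include <string>
  #include <unordered_map>

  int main() {
    // Static counts: how many last-lived ops reference each variable.
    std::unordered_map<std::string, size_t> global_ref_cnts{{"x", 2}, {"y", 1}};
    // Runtime copy, decremented concurrently by finishing ops.
    std::unordered_map<std::string, std::atomic<size_t>> runtime_ref_cnts;

    // Reset runtime counts from the global ones before an iteration.
    for (auto &pair : global_ref_cnts) runtime_ref_cnts[pair.first] = pair.second;

    // Each op that consumes a variable decrements its runtime count;
    // the consumer that drives it to zero may hand it to the collector.
    auto release = [&](const std::string &name) {
      if (runtime_ref_cnts[name].fetch_sub(1) == 1) {
        std::cout << name << " is dead, can be collected\n";
      }
    };
    release("x");  // x still has one pending reader
    release("x");  // last reader finished -> collectable
    release("y");  // only reader finished -> collectable
    return 0;
  }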
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
...
...
@@ -18,9 +18,6 @@
#include <vector>
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#endif
namespace paddle {
namespace framework {
...
...
@@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  drop_scope_counter_ += 1;
#ifdef PADDLE_WITH_CUDA
  const std::string gc_name = "garbage_collector";
  DeviceGarbageCollectorMap *gc =
      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
                           : nullptr;
#endif
  if (!fetch_tensors.empty() ||
      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
#ifdef PADDLE_WITH_CUDA
      if (gc != nullptr && platform::is_gpu_place(p)) {
        auto gpu_place = boost::get<platform::CUDAPlace>(p);
        auto &gc_at_place = gc->at(gpu_place.device);
        gc_at_place->Wait();
        gc_at_place->Reset();
      }
#endif
    }
    for (auto &scope : local_scopes_) {
      auto &local_scope =
...
...
paddle/fluid/framework/executor.cc
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
...
...
@@ -41,11 +42,43 @@ namespace {
int kProgramId = -1;
}  // namespace

static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
    const BlockDesc &block, const std::vector<std::string> &skip_var_list) {
  std::unordered_map<std::string, size_t> ref_cnts;
  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
                                            skip_var_list.end());
  auto update_ref_cnts = [&](OpDesc *op_desc, const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        if (skip_vars.count(name)) continue;
        auto *var_desc = block.FindVar(name);
        if (var_desc == nullptr || var_desc->Persistable()) continue;
        auto type = var_desc->Proto()->type().type();
        if (type != proto::VarType::LOD_TENSOR &&
            type != proto::VarType::SELECTED_ROWS &&
            type != proto::VarType::LOD_TENSOR_ARRAY) {
          continue;
        }
        ++ref_cnts[name];
      }
    }
  };
  for (auto op_desc : block.AllOps()) {
    update_ref_cnts(op_desc, op_desc->Inputs());
    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
}

ExecutorPrepareContext::ExecutorPrepareContext(
    const framework::ProgramDesc &prog, size_t block_id)
    const framework::ProgramDesc &prog, size_t block_id,
    const std::vector<std::string> &skip_ref_cnt_vars)
    : prog_(prog), block_id_(block_id) {
  if (GetEagerDeletionThreshold() >= 0) {
    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
                                                        skip_ref_cnt_vars);
  }
}
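What the counting above amounts to, as a self-contained sketch with standard containers only: each appearance of a deletable, non-persistable variable as an op input or output adds one to its count, and names on the skip list (typically feed/fetch variables) are never counted, so they are never eagerly deleted. The OpIO struct and the variable names below are hypothetical stand-ins, not Paddle types.

  #include <iostream>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>

  // Hypothetical stand-in for one op's input/output variable names.
  struct OpIO {
    std::vector<std::string> inputs;
    std::vector<std::string> outputs;
  };

  int main() {
    std::vector<OpIO> block = {{{"img"}, {"conv_out"}},
                               {{"conv_out"}, {"relu_out"}},
                               {{"relu_out"}, {"fetch_var"}}};
    std::unordered_set<std::string> skip_vars = {"fetch_var"};

    std::unordered_map<std::string, size_t> ref_cnts;
    auto count = [&](const std::vector<std::string> &names) {
      for (auto &name : names) {
        if (skip_vars.count(name)) continue;  // never eagerly delete these
        ++ref_cnts[name];
      }
    };
    for (auto &op : block) {
      count(op.inputs);
      count(op.outputs);
    }
    for (auto &pair : ref_cnts)
      std::cout << pair.first << " -> " << pair.second << "\n";
    return 0;
  }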
...
...
@@ -53,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
}

template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope &scope, const OperatorBase *op,
                                GarbageCollector<Tensor> *gc,
                                RefCntMap *ref_cnts) {
  std::unordered_set<Tensor *> erase_tensors;
static void DeleteUnusedTensors(
    const Scope &scope, const OperatorBase *op, GarbageCollector *gc,
    std::unordered_map<std::string, size_t> *ref_cnts) {
  std::deque<std::shared_ptr<memory::Allocation>> garbages;

  auto handler = [&](const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        auto it = ref_cnts->find(name);
        if (it == ref_cnts->end()) continue;
        if ((it->second)-- == 1) {
        if (--(it->second) != 0) {
          continue;
        }
        auto *var = scope.FindVar(name);
        if (var != nullptr) {
          VLOG(10) << "Erase tensor \'" << name << "\'";
          continue;
        }

        VLOG(2) << "Erase variable " << name;
        if (var->IsType<LoDTensor>()) {
          erase_tensors.insert(var->GetMutable<LoDTensor>());
          garbages.emplace_back(
              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
        } else if (var->IsType<SelectedRows>()) {
          erase_tensors.insert(
              var->GetMutable<SelectedRows>()->mutable_value());
          garbages.emplace_back(var->GetMutable<SelectedRows>()
                                    ->mutable_value()
                                    ->MoveMemoryHolder());
        } else if (var->IsType<LoDTensorArray>()) {
          auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
          for (auto &t : *lod_tensor_arr) {
            garbages.emplace_back(t.MoveMemoryHolder());
          }
        } else {
          PADDLE_THROW("Type %s of %s is not supported eager deletion",
                       var->Type().name(), name);
        }
      }
    }
...
...
@@ -83,8 +128,8 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
  handler(op->Inputs());
  handler(op->Outputs());

  if (!erase_tensors.empty()) {
    gc->Add(erase_tensors);
  if (!garbages.empty()) {
    gc->Add(std::move(garbages));
  }
}
...
...
@@ -112,9 +157,9 @@ void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE
// TODO(typhoonzero): complete message will need to use real trainer_id,
// except 0.
  ::paddle::operators::distributed::RPCClient::GetInstance<
      ::paddle::operators::distributed::GRPCClient>(0)
      ->SendComplete();
  auto client =
      paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
  client->SendComplete();
#endif
}
...
...
@@ -325,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
}

std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc &program, int block_id) {
    const ProgramDesc &program, int block_id,
    const std::vector<std::string> &skip_ref_cnt_vars) {
  std::unique_ptr<ExecutorPrepareContext> ctx(
      new ExecutorPrepareContext(program, block_id));
      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto &block = program.Block(block_id);
  for (auto &op_desc : block.AllOps()) {
...
...
@@ -338,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
}

std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
    const ProgramDesc &program, const std::vector<int> &block_ids) {
    const ProgramDesc &program, const std::vector<int> &block_ids,
    const std::vector<std::vector<std::string>> &skip_ref_cnt_vars) {
  PADDLE_ENFORCE(
      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
      "skip_ref_cnt_vars should be either empty or equals to block number %d",
      block_ids.size());
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
  size_t idx = 0;
  for (auto &bid : block_ids) {
    auto *ctx = new ExecutorPrepareContext(program, bid);
    ExecutorPrepareContext *ctx;
    if (skip_ref_cnt_vars.empty()) {
      ctx = new ExecutorPrepareContext(program, bid);
    } else {
      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
    }
    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
    auto &block = program.Block(bid);
    for (auto &op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
    }
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
    ++idx;
  }
  return result;
}
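A hedged usage sketch of the two new Prepare overloads; only the signatures visible in this diff are assumed, and the program/variable names are illustrative.

  // Prepare block 0, keeping "feed" and "fetch" out of eager deletion.
  auto ctx = framework::Executor::Prepare(program, /*block_id=*/0,
                                          {"feed", "fetch"});

  // Prepare several blocks at once, with a per-block skip list.
  auto ctxs = framework::Executor::Prepare(program, {0, 1},
                                           {{"feed", "fetch"}, {}});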
...
...
@@ -365,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }

  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
  // WhileOp would set keep_kids to true,
  // because WhileGradOp needs the scopes created in WhileOp.
  // Perhaps, we should not perform eager deletion in WhileOp
  // The scopes and variables created by WhileOp would be deleted
  // in WhileGradOp.
  std::unique_ptr<GarbageCollector> gc;
  // skip while_op and while_grad_op temporarily
  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      } else {
        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      }
    } else if (platform::is_cpu_place(place_)) {
#endif
      gc.reset(new CPUGarbageCollector<Tensor>(
          boost::get<platform::CPUPlace>(place_), max_memory_size));
      gc.reset(new CPUGarbageCollector(
          boost::get<platform::CPUPlace>(place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
    }
#endif
...
...
@@ -389,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  for (auto &op : ctx->ops_) {
    op->Run(*local_scope, place_);
    if (gc != nullptr) {
    if (gc) {
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                          &(ctx->cur_ref_cnts_));
                          &(ctx->runtime_ref_cnts_));
    }
  }

  if (gc != nullptr) {
    gc->Wait();
  } else {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
...
...
paddle/fluid/framework/executor.h
...
...
@@ -27,52 +27,21 @@ limitations under the License. */
namespace paddle {
namespace framework {

template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc &prog, size_t block_id) {
  auto &block = prog.Block(block_id);
  std::unordered_map<std::string, T> ref_cnts;

  auto update_ref_cnts = [&](OpDesc *op_desc, const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        auto *var_desc = block.FindVar(name);
        if (var_desc == nullptr || var_desc->Persistable()) continue;
        auto type = var_desc->Proto()->type().type();
        if (type != proto::VarType::LOD_TENSOR &&
            type != proto::VarType::SELECTED_ROWS) {
          continue;
        }

        auto it = ref_cnts.find(name);
        if (it != ref_cnts.end()) {
          ++it->second;
        } else {
          ref_cnts[name] = 1;
        }
      }
    }
  };

  for (auto op_desc : block.AllOps()) {
    update_ref_cnts(op_desc, op_desc->Inputs());
    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
}

struct ExecutorPrepareContext {
  ExecutorPrepareContext(const framework::ProgramDesc &prog, size_t block_id);
  ExecutorPrepareContext(const framework::ProgramDesc &prog, size_t block_id,
                         const std::vector<std::string> &skip_ref_cnt_vars =
                             std::vector<std::string>());

  ~ExecutorPrepareContext();

  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }

  const framework::ProgramDesc &prog_;
  size_t block_id_;
  std::vector<std::unique_ptr<OperatorBase>> ops_;

  std::unordered_map<std::string, int> ref_cnts_;
  std::unordered_map<std::string, int> cur_ref_cnts_;
  std::unordered_map<std::string, size_t> global_ref_cnts_;
  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
};

class Executor {
...
...
@@ -108,10 +77,14 @@ class Executor {
      const std::string &fetch_holder_name = "fetch");

  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc &program, int block_id);
      const ProgramDesc &program, int block_id,
      const std::vector<std::string> &skip_ref_cnt_vars =
          std::vector<std::string>());

  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
      const ProgramDesc &program, const std::vector<int> &block_ids);
      const ProgramDesc &program, const std::vector<int> &block_ids,
      const std::vector<std::vector<std::string>> &skip_ref_cnt_vars =
          std::vector<std::vector<std::string>>());

  void CreateVariables(const ProgramDesc &pdesc, Scope *scope, int block_id);
...
...
paddle/fluid/framework/executor_thread_worker.cc
...
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
...
...
@@ -154,6 +155,8 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
}

void ExecutorThreadWorker::TrainFiles() {
  platform::SetNumThreads(1);

  // todo: configurable
  SetDevice();
...
...
paddle/fluid/framework/garbage_collector.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {

GarbageCollector::GarbageCollector(const platform::Place &place,
                                   size_t max_memory_size)
    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
  garbages_.reset(new GarbageQueue());
  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
}

CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
                                         size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void CPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
}

#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void UnsafeFastGPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
}

DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void DefaultStreamGarbageCollector::Wait() const {
  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
      ->WaitStreamCallback();
}

void DefaultStreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
      ->AddStreamCallback(callback);
}

StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
                                               size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {
  platform::CUDADeviceGuard guard(place.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}

StreamGarbageCollector::~StreamGarbageCollector() {
  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
  platform::CUDADeviceGuard guard(place.device);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}

cudaStream_t StreamGarbageCollector::stream() const { return stream_; }

void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }

void StreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback_manager_->AddCallback(callback);
}
#endif
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/garbage_collector.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
...
...
@@ -24,134 +23,74 @@
namespace paddle {
namespace framework {

// T should have memory_size() and clear() method
template <typename T>
class GarbageCollector {
 public:
  GarbageCollector(const platform::Place &place, size_t max_memory_size)
      : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
    garbages_.reset(new std::deque<T *>());
    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
  }
  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;

  virtual ~GarbageCollector() {}
  GarbageCollector(const platform::Place &place, size_t max_memory_size);

  void Reset() {
    std::lock_guard<std::mutex> guard(mutex_);
    garbages_.reset(new std::deque<T *>());
    cur_memory_size_ = 0;
  }
  virtual ~GarbageCollector() = default;

  virtual void Wait() const {}

  template <typename Container>
  void Add(const Container &objs) { Add(objs, []() {}); }
  void Add(Container &&objs);

  template <typename Container, typename Callback>
  void Add(const Container &objs, Callback &&callback) {
    std::shared_ptr<std::deque<T *>> clear_deque;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      for (auto *obj : objs) {
        garbages_->push_back(obj);
        cur_memory_size_ += obj->memory_size();
      }
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        clear_deque = garbages_;
        garbages_.reset(new std::deque<T *>());
      }
    }

    if (clear_deque != nullptr) {
      callback();
      ClearCallback([=]() {
        for (auto *obj : *clear_deque) obj->clear();
      });
    }
  }
  virtual void Wait() const {}
  void Add(Container &&objs, Callback &&callback);

 protected:
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  platform::DeviceContext *dev_ctx_;
  std::shared_ptr<std::deque<T *>> garbages_;
  std::unique_ptr<GarbageQueue> garbages_;
  mutable std::mutex mutex_;
  const size_t max_memory_size_;
  size_t cur_memory_size_ = 0;
  size_t cur_memory_size_{0};
};

template <typename T>
class CPUGarbageCollector : public GarbageCollector<T> {
class CPUGarbageCollector : public GarbageCollector {
 public:
  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}
  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    callback();
  }
  void ClearCallback(const std::function<void()> &callback) override;
};

#ifdef PADDLE_WITH_CUDA
template <typename T>
class DefaultStreamGarbageCollector : public GarbageCollector<T> {
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}
  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size);

  cudaStream_t stream() const {
    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->stream();
  }

 protected:
  void ClearCallback(const std::function<void()> &callback) override;
};

  void Wait() const override {
    this->dev_ctx_->Wait();
    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->WaitStreamCallback();
  }
class DefaultStreamGarbageCollector : public GarbageCollector {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size);

  void Wait() const override;

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
        ->AddStreamCallback(callback);
  }
  void ClearCallback(const std::function<void()> &callback) override;
};

template <typename T>
class StreamGarbageCollector : public GarbageCollector<T> {
class StreamGarbageCollector : public GarbageCollector {
 public:
  StreamGarbageCollector(const platform::CUDAPlace &place,
                         size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }
                         size_t max_memory_size);

  ~StreamGarbageCollector() {
    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
  }
  ~StreamGarbageCollector();

  void Wait() const override {
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->Wait();
  }
  void Wait() const override;

  cudaStream_t stream() const { return stream_; }
  cudaStream_t stream() const;

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->AddCallback(callback);
  }
  void ClearCallback(const std::function<void()> &callback) override;

 private:
  cudaStream_t stream_;
...
...
@@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
};
#endif

template <typename Container>
void GarbageCollector::Add(Container &&objs) {
  Add(std::forward<Container>(objs), []() {});
}

template <typename Container, typename Callback>
void GarbageCollector::Add(Container &&objs, Callback &&callback) {
  GarbageQueue *garbage_queue = nullptr;
  {
    std::lock_guard<std::mutex> guard(mutex_);
    for (auto &obj : objs) {
      if (!obj) continue;
      cur_memory_size_ += obj->size();
      garbages_->push_back(std::move(obj));
    }
    if (cur_memory_size_ >= max_memory_size_) {
      cur_memory_size_ = 0;
      garbage_queue = garbages_.release();
      garbages_.reset(new GarbageQueue());
    }
  }

  if (garbage_queue) {
    callback();
    ClearCallback([garbage_queue]() { delete garbage_queue; });
  }
}

}  // namespace framework
}  // namespace paddle
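The Add path above batches deallocation: garbage accumulates under a lock until its total size crosses max_memory_size_, then the whole queue is released through ClearCallback. Below is a self-contained toy version of that policy (standard library only, ToyCollector is a made-up name); real collectors hand the batch to a device-stream callback instead of freeing it inline.

  #include <deque>
  #include <iostream>
  #include <memory>
  #include <mutex>
  #include <vector>

  class ToyCollector {
   public:
    explicit ToyCollector(size_t max_memory_size)
        : max_memory_size_(max_memory_size) {}

    void Add(std::shared_ptr<std::vector<char>> obj) {
      std::deque<std::shared_ptr<std::vector<char>>> *batch = nullptr;
      {
        std::lock_guard<std::mutex> guard(mutex_);
        cur_memory_size_ += obj->size();
        garbages_.push_back(std::move(obj));
        if (cur_memory_size_ >= max_memory_size_) {
          cur_memory_size_ = 0;
          // Detach the whole queue so it can be freed outside the lock.
          batch = new std::deque<std::shared_ptr<std::vector<char>>>(
              std::move(garbages_));
          garbages_.clear();
        }
      }
      if (batch) {
        std::cout << "freeing a batch of " << batch->size() << " buffers\n";
        delete batch;  // dropping the shared_ptrs releases the memory
      }
    }

   private:
    std::mutex mutex_;
    std::deque<std::shared_ptr<std::vector<char>>> garbages_;
    const size_t max_memory_size_;
    size_t cur_memory_size_ = 0;
  };

  int main() {
    ToyCollector gc(1 << 20);  // flush roughly every 1 MB of dead buffers
    for (int i = 0; i < 8; ++i) {
      gc.Add(std::make_shared<std::vector<char>>(512 * 1024));  // 512 KB each
    }
    return 0;
  }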
paddle/fluid/framework/ir/CMakeLists.txt
...
...
@@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
if(WITH_MKLDNN)
  pass_library(mkldnn_placement_pass base)
  pass_library(depthwise_conv_mkldnn_pass base)
...
...
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(elementwise_add_op_1); \
GET_IR_NODE(elementwise_add_in_y_1); \
GET_IR_NODE(elementwise_add_out_1); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &bias1, const std::string &activation,
    const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {bias1});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);
  desc.SetAttr("use_cudnn", false);

  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
      "conv2d", "Input");

  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string bias1_name = elementwise_add_in_y_1->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
                                      act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    auto new_conv_op = graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, new_conv_op);             // Input
    IR_NODE_LINK_TO(conv_filter, new_conv_op);              // Filter
    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);     // Bias
    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);   // ResidualData
    IR_NODE_LINK_TO(new_conv_op, act_out);                  // Output

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(),
                         {conv_op, elementwise_add_op, elementwise_add_op_1,
                          elementwise_add_out});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
#include <string>
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(elementwise_add_op_1); \
GET_IR_NODE(elementwise_add_in_y_1); \
GET_IR_NODE(elementwise_add_out_1); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &bias1, const std::string &activation,
    const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {bias1});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);

  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
      "conv2d", "Input");

  patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string bias1_name = elementwise_add_in_y_1->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
                                      act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, conv_op);             // Input
    IR_NODE_LINK_TO(conv_filter, conv_op);              // Filter
    IR_NODE_LINK_TO(conv_op, conv_out);                 // Output
    IR_NODE_LINK_TO(elementwise_add_in_y, conv_op);     // Bias
    IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op);   // Bias

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(),
                         {conv_op, elementwise_add_op, elementwise_add_op_1,
                          elementwise_add_out});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {

class ConvElementwiseAdd2ActFusePass : public FusePassBase {
 public:
  virtual ~ConvElementwiseAdd2ActFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &activation, const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetType("conv2d_fusion");
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);
  desc.SetAttr("use_cudnn", false);
  desc.Flush();
  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()
                ->NewNode("x")
                ->assert_is_op_input("conv2d", "Input")
                ->AsInput();

  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto =
        PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    auto *new_conv_op = graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
    IR_NODE_LINK_TO(new_conv_op, act_out);               // Output

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
                                       elementwise_add_out, act_op});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAddActFusePass);
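A hedged usage sketch of applying the newly registered pass; the PassRegistry calls are the same ones used in parallel_executor.cc elsewhere in this commit, and the pass name comes from the REGISTER_PASS line above.

  auto fuse_pass =
      ir::PassRegistry::Instance().Get("conv_elementwise_add_act_fuse_pass");
  graph = fuse_pass->Apply(std::move(graph));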
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {

class ConvElementwiseAddActFusePass : public FusePassBase {
 public:
  virtual ~ConvElementwiseAddActFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/graph.h
...
...
@@ -73,14 +73,21 @@ class Graph {
  }

  bool Has(const std::string &attr_name) const {
    return attrs_.find(attr_name) != attrs_.end();
    return attrs_.count(attr_name) > 0;
  }

  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                   attr_name);
    try {
      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
    } catch (boost::bad_any_cast &) {
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(),
          attrs_.at(attr_name).type().name());
    }
  }

  template <typename AttrType>
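A hedged usage sketch of the attribute API shown above: Has() guards Get(), which enforces that the attribute was registered. The attribute name and type follow the reference_count_pass.cc code earlier in this commit; the surrounding variables are illustrative.

  if (graph.Has(details::kGraphVars)) {
    auto &vars = graph.Get<details::GraphVars>(details::kGraphVars);
    VLOG(10) << "graph holds vars for " << vars.size() << " scopes";
  }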
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
...
...
@@ -25,6 +26,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace framework {
namespace ir {
...
...
@@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
  for (auto &node : GraphTraits::DFS(graph)) {
    for (const auto &pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
        VLOG(4) << "pdnode " << pdnode->name() << " marked";
        VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name();
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
...
@@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return
out_var
;
}
std
::
unordered_set
<
std
::
string
>
conv_act_set
({
"identity"
,
"sigmoid"
,
"relu"
,
"relu6"
,
"relux"
,
"tanh"
,
"band_pass"
});
PDNode
*
patterns
::
ConvElementwiseaddAct
::
operator
()(
PDNode
*
conv_in
)
{
conv_in
->
AsInput
();
auto
conv_op
=
pattern
->
NewNode
(
conv_op_repr
())
->
assert_is_op
(
"conv2d"
);
auto
conv_out
=
pattern
->
NewNode
(
conv_out_repr
())
->
assert_is_op_output
(
"conv2d"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
conv_filter
=
pattern
->
NewNode
(
conv_filter_repr
())
->
assert_is_op_input
(
"conv2d"
,
"Filter"
)
->
AsInput
();
auto
elementwise_add_op
=
pattern
->
NewNode
(
elementwise_add_op_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y
=
pattern
->
NewNode
(
elementwise_add_in_y_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out
=
pattern
->
NewNode
(
elementwise_add_out_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
AsIntermediate
();
auto
act_op
=
pattern
->
NewNode
(
act_op_repr
())
->
assert_is_op
()
->
assert_more
([
&
](
Node
*
node
)
{
auto
op_type
=
node
->
Name
();
return
conv_act_set
.
count
(
op_type
);
});
auto
act_out
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_var
()
// is activation op's output.
->
assert_more
([
&
](
Node
*
node
)
{
for
(
auto
*
in_op
:
node
->
inputs
)
{
if
(
conv_act_set
.
count
(
in_op
->
Name
()))
{
return
true
;
}
}
return
false
;
})
->
AsOutput
();
conv_op
->
LinksFrom
({
conv_in
,
conv_filter
});
conv_out
->
LinksFrom
({
conv_op
});
elementwise_add_op
->
LinksFrom
({
conv_out
,
elementwise_add_in_y
})
.
LinksTo
({
elementwise_add_out
});
act_op
->
LinksFrom
({
elementwise_add_out
}).
LinksTo
({
act_out
});
return
act_out
;
}
PDNode
*
patterns
::
ConvElementwiseadd2Act
::
operator
()(
PDNode
*
conv_in
)
{
auto
conv_op
=
pattern
->
NewNode
(
conv_op_repr
())
->
assert_is_op
(
"conv2d"
);
auto
conv_filter
=
pattern
->
NewNode
(
conv_filter_repr
())
->
assert_is_op_input
(
"conv2d"
,
"Filter"
)
->
AsInput
();
auto
conv_out
=
pattern
->
NewNode
(
conv_out_repr
())
->
assert_is_op_output
(
"conv2d"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
elementwise_add_op
=
pattern
->
NewNode
(
elementwise_add_op_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y
=
pattern
->
NewNode
(
elementwise_add_in_y_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out
=
pattern
->
NewNode
(
elementwise_add_out_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
elementwise_add_op_1
=
pattern
->
NewNode
(
elementwise_add_op_1_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y_1
=
pattern
->
NewNode
(
elementwise_add_in_y_1_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out_1
=
pattern
->
NewNode
(
elementwise_add_out_1_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
AsIntermediate
();
auto
act_op
=
pattern
->
NewNode
(
act_op_repr
())
->
assert_is_op
()
->
assert_more
([
&
](
Node
*
node
)
{
auto
op_type
=
node
->
Name
();
return
conv_act_set
.
count
(
op_type
);
});
auto
act_out
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_var
()
// is activation op's output.
->
assert_more
([
&
](
Node
*
node
)
{
for
(
auto
*
in_op
:
node
->
inputs
)
{
if
(
conv_act_set
.
count
(
in_op
->
Name
()))
{
return
true
;
}
}
return
false
;
})
->
AsOutput
();
conv_op
->
LinksFrom
({
conv_in
,
conv_filter
}).
LinksTo
({
conv_out
});
elementwise_add_op
->
LinksFrom
({
conv_out
,
elementwise_add_in_y
})
.
LinksTo
({
elementwise_add_out
});
elementwise_add_op_1
->
LinksFrom
(
{
elementwise_add_out
,
elementwise_add_in_y_1
});
act_op
->
LinksFrom
({
elementwise_add_out_1
}).
LinksTo
({
act_out
});
return
act_out
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h
...
...
@@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase {
  PATTERN_DECL_NODE(elementwise_add_y);
  PATTERN_DECL_NODE(elementwise_add_out);
};

// Conv + ElementwiseAdd + an activation
// This pattern can futher fuse the conv related ops after the conv+bn fusion.
struct ConvElementwiseaddAct : public PatternBase {
  ConvElementwiseaddAct(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {}

  PDNode *operator()(PDNode *conv_in);

  PATTERN_DECL_NODE(conv_op);
  PATTERN_DECL_NODE(conv_out);
  PATTERN_DECL_NODE(conv_filter);

  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
  PATTERN_DECL_NODE(elementwise_add_out);

  PATTERN_DECL_NODE(act_op);
  PATTERN_DECL_NODE(act_out);
};

// Conv + ElementwiseAdd + ElementwiseAdd + Activation
struct ConvElementwiseadd2Act : public PatternBase {
  ConvElementwiseadd2Act(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope,
                    "conv_elementwiseadd2_elementwiseadd_act") {}

  PDNode *operator()(PDNode *conv_in);

  PATTERN_DECL_NODE(conv_op);
  PATTERN_DECL_NODE(conv_filter);
  PATTERN_DECL_NODE(conv_out);

  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
  PATTERN_DECL_NODE(elementwise_add_out);

  PATTERN_DECL_NODE(elementwise_add_op_1);
  PATTERN_DECL_NODE(elementwise_add_in_y_1);  // input
  PATTERN_DECL_NODE(elementwise_add_out_1);

  PATTERN_DECL_NODE(act_op);
  PATTERN_DECL_NODE(act_out);
};

}  // namespace patterns
// Link two ir::Nodes from each other.
...
...
paddle/fluid/framework/ir/pass.h
...
...
@@ -51,11 +51,18 @@ class Pass {
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                   "%s attr not registered for pass.", attr_name);
    try {
      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
    } catch (boost::bad_any_cast &) {
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(),
          attrs_.at(attr_name).type().name());
    }
  }

  bool Has(const std::string &attr_name) const {
    return attrs_.find(attr_name) != attrs_.end();
    return attrs_.count(attr_name) > 0;
  }

  void Erase(const std::string &attr_name) {
...
...
paddle/fluid/framework/op_kernel_type_test.cc
...
...
@@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) {
  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
                               LibraryType::kCUDNN);
  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
            "data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
            "CUDAPlace(0)]:library_"
            "type[CUDNN]");
}
...
...
paddle/fluid/framework/operator.cc
...
...
@@ -879,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
      t = &(var->Get<SelectedRows>().value());
    }
    if (t != nullptr) {
      PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
                     ipt_name, DebugString());
      int tmp = static_cast<int>(t->type());
      PADDLE_ENFORCE(tmp == data_type || data_type == -1,
...
...
paddle/fluid/framework/parallel_executor.cc
...
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -72,6 +73,26 @@ class ParallelExecutorPrivate {
}
}
}
std
::
unique_ptr
<
ir
::
Graph
>
PrepareGCAndRefCnts
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
size_t
max_memory_size
);
inline
bool
HasGarbageCollectors
()
const
{
return
!
gcs_
.
empty
();
}
void
ResetRuntimeReferenceCount
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
for
(
size_t
i
=
0
;
i
<
runtime_ref_cnts_
.
size
();
++
i
)
{
for
(
auto
&
pair
:
global_ref_cnts_
[
i
])
{
runtime_ref_cnts_
[
i
][
pair
.
first
]
=
pair
.
second
;
}
for
(
auto
&
fetch_name
:
fetch_tensors
)
{
runtime_ref_cnts_
[
i
].
erase
(
fetch_name
);
}
runtime_ref_cnts_
[
i
].
erase
(
fetched_var_name
);
}
}
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
local_scopes_
;
Scope
*
global_scope_
;
// not owned
...
...
@@ -83,8 +104,76 @@ class ParallelExecutorPrivate {
bool
own_local_scope_
;
bool
use_cuda_
;
bool
use_all_reduce_
;
// global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
// then keeps unchanged
// Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_
std
::
vector
<
details
::
ReferenceCountMap
>
global_ref_cnts_
;
std
::
vector
<
details
::
AtomicReferenceCountMap
>
runtime_ref_cnts_
;
details
::
GarbageCollectorMap
gcs_
;
};
std
::
unique_ptr
<
ir
::
Graph
>
ParallelExecutorPrivate
::
PrepareGCAndRefCnts
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
size_t
max_memory_size
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
place
=
places_
[
i
];
if
(
gcs_
.
count
(
place
)
>
0
)
{
continue
;
}
std
::
unique_ptr
<
GarbageCollector
>
gc
;
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place
))
{
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
max_memory_size
));
}
else
{
gc
.
reset
(
new
StreamGarbageCollector
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
max_memory_size
));
}
VLOG
(
10
)
<<
"Created "
<<
i
<<
"-th GarbageCollector at "
<<
place
;
}
else
{
#endif
if
(
platform
::
is_cpu_place
(
place
))
{
gc
.
reset
(
new
CPUGarbageCollector
(
boost
::
get
<
platform
::
CPUPlace
>
(
place
),
max_memory_size
));
VLOG
(
10
)
<<
"Created GarbageCollector at "
<<
place
;
}
else
{
PADDLE_THROW
(
"Unsupported place for garbage collection"
);
}
#ifdef PADDLE_WITH_CUDA
}
#endif
gcs_
.
emplace
(
place
,
std
::
move
(
gc
));
}
if
(
!
gcs_
.
empty
())
{
std
::
vector
<
details
::
LastLiveOpsOfVars
>
last_live_ops_of_vars
;
auto
ref_cnt_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"reference_count_pass"
);
ref_cnt_pass
->
SetNotOwned
(
details
::
kGlobalReferenceCount
,
&
global_ref_cnts_
);
ref_cnt_pass
->
SetNotOwned
(
details
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
graph
=
ref_cnt_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"ReferenceCountPass Applied"
;
auto
eager_deletion_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"eager_deletion_pass"
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kRuntimeReferenceCount
,
&
runtime_ref_cnts_
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kGarbageCollector
,
&
gcs_
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kAllPlaces
,
&
places_
);
graph
=
eager_deletion_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"EagerDeletionPass Applied"
;
}
return
graph
;
}
std
::
vector
<
Scope
*>
&
ParallelExecutor
::
GetLocalScopes
()
{
return
member_
->
local_scopes_
;
}
...
...
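`PrepareGCAndRefCnts` above builds one garbage collector per place and then relies on `ResetRuntimeReferenceCount` to re-arm an atomic per-variable counter before every iteration, excluding fetched variables so they are never eagerly freed. The sketch below reproduces just that counting scheme in standard C++; all type and function names here are hypothetical stand-ins, not Paddle's `GarbageCollector`/`ReferenceCountMap` API.

// Illustrative sketch of the per-iteration reference-count reset used above.
// All names here are hypothetical; only the counting idea follows the diff.
#include <atomic>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using ReferenceCountMap = std::unordered_map<std::string, size_t>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<size_t>>;

// Copy the static counts into the runtime map, dropping fetched variables so
// they are never eagerly deleted while the caller still needs them.
void ResetRuntimeRefCnts(const ReferenceCountMap& global,
                         const std::vector<std::string>& fetch_vars,
                         AtomicReferenceCountMap* runtime) {
  for (const auto& pair : global) (*runtime)[pair.first] = pair.second;
  for (const auto& name : fetch_vars) runtime->erase(name);
}

// Called once per op that consumed `var`; when the count hits zero the
// variable's memory could be handed to a garbage collector.
bool OnVarUsed(const std::string& var, AtomicReferenceCountMap* runtime) {
  auto it = runtime->find(var);
  if (it == runtime->end()) return false;  // skipped (e.g. fetched) variable
  return it->second.fetch_sub(1) == 1;     // true -> last use, safe to free
}

int main() {
  ReferenceCountMap global{{"x", 2}, {"loss", 1}};
  AtomicReferenceCountMap runtime;
  ResetRuntimeRefCnts(global, {"loss"}, &runtime);  // "loss" is fetched, keep it

  std::cout << OnVarUsed("x", &runtime) << "\n";     // 0: one consumer left
  std::cout << OnVarUsed("x", &runtime) << "\n";     // 1: last use, free "x"
  std::cout << OnVarUsed("loss", &runtime) << "\n";  // 0: fetched, never freed here
}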
@@ -151,36 +240,18 @@ ParallelExecutor::ParallelExecutor(
  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
-  auto max_memory_size = GetEagerDeletionThreshold();
-  if (max_memory_size >= 0) {
-    for (auto &place : member_->places_) {
-      if (!platform::is_gpu_place(place)) continue;
-      auto gpu_place = boost::get<platform::CUDAPlace>(place);
-      if (gcs_[gpu_place.device] == nullptr) {
-        ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
-        cur_ref_cnts_[gpu_place.device].reset(
-            new details::AtomicReferenceCountMap());
-        gcs_[gpu_place.device].reset(
-            new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
-      }
-    }
-    if (!gcs_.empty()) {
-      auto ref_cnt_pass =
-          ir::PassRegistry::Instance().Get("reference_count_pass");
-      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = ref_cnt_pass->Apply(std::move(graph));
-      graph->SetNotOwned("garbage_collector", &gcs_);
-    }
-  }
 #else
  std::unique_ptr<ir::Graph> graph =
      build_strategy.Apply(main_program, member_->places_, loss_var_name,
                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
+  auto max_memory_size = GetEagerDeletionThreshold();
+  if (max_memory_size >= 0) {
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
+  }

  // Step 3. Create vars in each scope. Passes may also create new vars.
  // skip control vars and empty vars
  std::vector<details::VariableInfo> var_infos;
...
@@ -300,18 +371,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #endif
  platform::RecordBlock b(0);
-#ifdef PADDLE_WITH_CUDA
-  if (!gcs_.empty()) {
-    ResetReferenceCount();
-    for (auto &pair : cur_ref_cnts_) {
-      auto &name_map = *(pair.second);
-      for (auto &fetch_name : fetch_tensors) {
-        name_map.erase(fetch_name);
-      }
-      name_map.erase(fetched_var_name);
-    }
+  if (member_->HasGarbageCollectors()) {
+    member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name);
  }
-#endif
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
...
@@ -355,13 +417,11 @@ ParallelExecutor::~ParallelExecutor() {
  for (auto &p : member_->places_) {
    platform::DeviceContextPool::Instance().Get(p)->Wait();
  }
-  // member_ must be destructed before gcs_ since the destructor of
-  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
-  member_.reset();
+  delete member_;
 }

}  // namespace framework
}  // namespace paddle

#ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass);
#endif
+USE_PASS(eager_deletion_pass);
paddle/fluid/framework/parallel_executor.h
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once

-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
...
@@ -29,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_pass.h"
-#endif
-
 namespace paddle {
 namespace framework {
...
@@ -75,24 +70,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;

-  std::unique_ptr<ParallelExecutorPrivate> member_;
-
-#ifdef PADDLE_WITH_CUDA
-  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
-  // keeps unchanged
-  // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_
-  details::DeviceReferenceCountMap ref_cnts_;
-  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
-  details::DeviceGarbageCollectorMap gcs_;
-
-  void ResetReferenceCount() {
-    for (auto &pair1 : ref_cnts_) {
-      for (auto &pair2 : *(pair1.second)) {
-        (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second;
-      }
-    }
-  }
-#endif
+  ParallelExecutorPrivate *member_;
 };

}  // namespace framework
...
paddle/fluid/framework/scope.cc
@@ -38,6 +38,10 @@ DEFINE_double(
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

+DEFINE_bool(fast_eager_deletion_mode, false,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
...
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() {
                 (static_cast<int64_t>(1) << 30));
 }

+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+
 Scope::~Scope() { DropKids(); }

 Scope &Scope::NewScope() const {
...
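The threshold flag above is given in GB and converted to bytes with a `<< 30` shift inside `GetEagerDeletionThreshold`, with a negative value disabling collection. The sketch below shows just that conversion in isolation; the parameter stands in for the real gflags value and is not Paddle's flag definition.

// Minimal sketch of the GB -> bytes conversion implied by the scope.cc hunk.
// `eager_delete_tensor_gb` stands in for the real flag value; -1 disables GC.
#include <cstdint>
#include <iostream>

int64_t EagerDeletionThresholdBytes(double eager_delete_tensor_gb) {
  if (eager_delete_tensor_gb < 0) return -1;  // collector disabled
  // 1 GB == 2^30 bytes; keep the shift in int64_t to avoid overflow.
  return static_cast<int64_t>(eager_delete_tensor_gb *
                              (static_cast<int64_t>(1) << 30));
}

int main() {
  std::cout << EagerDeletionThresholdBytes(-1.0) << "\n";  // -1 (disabled)
  std::cout << EagerDeletionThresholdBytes(0.0) << "\n";   // 0 (collect eagerly)
  std::cout << EagerDeletionThresholdBytes(1.5) << "\n";   // 1610612736
}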
paddle/fluid/framework/scope.h
@@ -27,6 +27,7 @@ namespace paddle {
 namespace framework {

 int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();

 class Scope;
...
paddle/fluid/framework/tensor.h
@@ -14,15 +14,14 @@ limitations under the License. */
 #pragma once

-#include <paddle/fluid/framework/framework.pb.h>
 #include <cstdint>
 #include <cstring>
 #include <memory>
 #include <typeindex>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -159,6 +158,10 @@ class Tensor {
  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
  size_t offset() const { return offset_; }

+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
+    return std::move(holder_);
+  }
+
 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<memory::Allocation> holder_;
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) {
 }

 TEST(AnalysisPredictor, analysis_on) {
-  AnalysisConfig config(false);
+#ifdef PADDLE_WITH_CUDA
+  AnalysisConfig config(true);
+  config.fraction_of_gpu_memory = 0.15;
+#else
+  AnalysisConfig config;
+#endif
  config.model_dir = FLAGS_dirname;
  config.enable_ir_optim = true;
...
paddle/fluid/inference/api/api_impl_tester.cc
@@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
  if (t->type() == framework::proto::VarType::INT64) {
    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
    pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == framework::proto::VarType::INT32) {
+  } else if (t->type() == framework::proto::VarType::FP32) {
    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
    pt.dtype = PaddleDType::FLOAT32;
  } else {
...
paddle/fluid/inference/api/paddle_pass_builder.h
@@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
    passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",                  //
+        "conv_bn_fuse_pass",                       //
+        "conv_elementwise_add_act_fuse_pass",      //
+        "conv_elementwise_add2_act_fuse_pass",     //
    });
  }
...
paddle/fluid/inference/io.cc
@@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
  for (auto* var : global_block.AllVars()) {
    if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(4) << "persistable variable's name: " << var->Name();
      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->GetShape());
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
  std::vector<PaddleTensor> outputs;
  if (use_analysis || use_tensorrt) {
    contrib::AnalysisConfig config(true);
+    config.pass_builder()->TurnOnDebug();
    SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
                                       FLAGS_batch_size);
    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
...
@@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) {
  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }

+TEST(resnext50, compare_analysis_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, false /*use tensorrt*/);
+}
+
 TEST(TensorRT_mobilenet, analysis) {
  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  compare(model_dir, /* use_tensorrt */ false);
+  compare(model_dir, false /* use_tensorrt */);
+}
+
+TEST(AnalysisPredictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config(true);
+  config.model_dir = model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+  config.pass_builder()->TurnOnDebug();
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto& input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+  }
 }

 }  // namespace inference
...
paddle/fluid/operators/controlflow/CMakeLists.txt
 include(operators)
-register_operators()
+register_operators(DEPS naive_executor)

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
paddle/fluid/operators/controlflow/while_op.cc
@@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes";
 static constexpr char kX[] = "X";
 static constexpr char kXGRAD[] = "X@GRAD";
 static constexpr char kOutputs[] = "Out";
+static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+
+namespace {  // NOLINT
+static std::string GetSkipEagerDeletionVarsDebugString(
+    const std::vector<std::string> &vars) {
+  std::string str = "Skip " + std::to_string(vars.size()) +
+                    " var(s) in eager deletion mode: ";
+  for (auto &var : vars) {
+    str.append(var);
+    str.push_back(' ');
+  }
+  return str;
+}
+}  // NOLINT
+
 class WhileOp : public framework::OperatorBase {
  public:
...
@@ -59,7 +73,10 @@ class WhileOp : public framework::OperatorBase {
                   "Condition of while op must in CPU memory.");

    bool is_test = Attr<bool>("is_test");
-    auto ctx = executor.Prepare(*program, block->ID());
+
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
    while (cond.data<bool>()[0]) {
      auto &current_scope = scope.NewScope();
      step_scopes->push_back(&current_scope);
...
@@ -96,6 +113,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
+    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
+                                      "Vars that would skip eager deletion."
+                                      "Users should not set this manually.")
+        .SetDefault(std::vector<std::string>());
    AddComment(R"DOC(
)DOC");
  }
...
@@ -119,7 +140,10 @@ class WhileGradOp : public framework::OperatorBase {
    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
-    auto ctx = executor.Prepare(*program, block->ID());
+
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);

    auto *step_scopes =
        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
...
@@ -341,6 +365,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
    // while operator could be renamed.
    while_grad->SetAttr("original_output_grad", output_grads_list);

+    while_grad->SetAttr(kSkipEagerDeletionVars, std::vector<std::string>());
+
    return std::unique_ptr<framework::OpDesc>(while_grad);
  }
 };
...
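The while_op changes above thread a `skip_eager_deletion_vars` list into `Executor::Prepare` so that forward intermediates needed by the backward while step are never eagerly freed. The toy sketch below illustrates that rationale with a plain map standing in for a scope; nothing here is Paddle code, and the variable names are invented for the example.

// Toy illustration (not Paddle code) of why some variables are excluded from
// eager deletion inside a while loop: the backward step still reads forward
// intermediates, so freeing them right after the forward op would lose data.
#include <iostream>
#include <set>
#include <string>
#include <unordered_map>

using Scope = std::unordered_map<std::string, double>;

void MaybeErase(Scope* scope, const std::string& name,
                const std::set<std::string>& skip_vars) {
  if (skip_vars.count(name) == 0) scope->erase(name);  // eager deletion
}

int main() {
  std::set<std::string> skip_vars{"hidden"};  // kept alive for the backward pass
  Scope scope;

  // forward step of one loop iteration
  scope["x"] = 2.0;
  scope["hidden"] = scope["x"] * 3.0;
  MaybeErase(&scope, "x", skip_vars);       // freed: x is no longer needed
  MaybeErase(&scope, "hidden", skip_vars);  // skipped: backward still reads it

  // backward step of the same iteration can still read the intermediate
  std::cout << "backward reads hidden = " << scope.at("hidden") << "\n";  // 6
}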
paddle/fluid/operators/conv_op.cc
@@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");

  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor.");
+                 "Conv intput should be 4-D or 5-D tensor, get %u",
+                 in_dims.size());
+
  PADDLE_ENFORCE_EQ(
      in_dims.size(), filter_dims.size(),
      "Conv input dimension and filter dimension should be the same.");
...
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
    }

    CudnnRNNCache *cudnn_rnn_cache = nullptr;
    if (cache_var->IsInitialized()) {
+      // const_cast is usually bad.
      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                            ->GetMutable<CudnnRNNCache>();
    } else {
+      // const_cast is usually bad.
      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                            ->GetMutable<CudnnRNNCache>();
      std::random_device rnd;
...
paddle/fluid/operators/distributed/CMakeLists.txt
@@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")

 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+  grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
      PROTO send_recv.proto
      DEPS lod_tensor selected_rows_functor memory)
...
@@ -20,36 +20,43 @@ if(WITH_GRPC)
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

  cc_test(grpc_serde_test SRCS grpc_serde_test.cc
-    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL)

  cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
+    DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
  if(WITH_GPU)
    cc_test(collective_server_test SRCS collective_server_test.cc
-      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
      selected_rows_functor scope math_function SERIAL)
  endif()
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 else()
-  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
+    collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

-  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
-    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
+    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+    collective_client.cc collective_server.cc
    PROTO send_recv.proto
    DEPS lod_tensor selected_rows memory)

-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)

-  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor
+    proto_desc lookup_sparse_table_op snappystream snappy zlib)

-  cc_test(brpc_server_test SRCS rpc_server_test.cc
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS ${brpc_test_depends} SERIAL)

  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
    DEPS ${brpc_test_depends} SERIAL)
+
+  if(WITH_GPU)
+    cc_test(collective_server_test SRCS collective_server_test.cc
+      DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL)
+  endif()
 endif()
paddle/fluid/operators/distributed/brpc_client.cc
(The hunks below interleave the old and new versions of changed lines, as rendered in the web diff.)
@@ -14,135 +14,316 @@
#include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace operators {
namespace distributed {

DEFINE_int32(brpc_channel_num, 24,
             "Number of channels to send requests connected to one server");
DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");

BRPCClient::~BRPCClient() { Wait(); }

void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response) {
void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response,
                        VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                        ChannelContextPtr ch_ctx, BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VoidMessage> response_guard(response);

  // this channel can be used by other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
    LOG(FATAL) << "Fail to send SendVar: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    var_h->Finish(false);
    cls->DecreaseReqCount();
    return;
  }
  LOG(INFO) << "Received response from " << cntl->remote_side()
            << " latency=" << cntl->latency_us() << "us";
  var_h->Finish(true);
  cls->DecreaseReqCount();

  VLOG(4) << "HandleSendResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";
  VLOG(4) << "Finish HandleSendResponse";
}

bool BRPCClient::AsyncSendVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
                              const platform::DeviceContext& ctx,
                              const framework::Scope& scope,
                              const std::string& var_name, int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "SendRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));

  framework::AsyncIO(
      [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();
    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
    cntl->set_timeout_ms(time_out);

    google::protobuf::Closure* done =
        brpc::NewCallback(&HandleSendResponse, cntl, response);

    auto* var = p_scope->FindVar(var_name_val);
    sendrecv::VariableMessage request;
    distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request,
                                  &cntl->request_attachment(), "", false,
                                  trainer_id_);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    platform::RecordRPCEvent record_event(method, p_ctx);

    ch_ctx->stub->SendVariable(cntl, &request, response, done);

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });
  req_count_++;

  return true;
  return var_h;
}

void HandleFetchBarrierResponse(brpc::Controller* cntl,
                                sendrecv::VariableMessage* response,
                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                ChannelContextPtr ch_ctx, BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);

  // this channel can be used other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    var_h->Finish(false);
    cls->DecreaseReqCount();
    return;
  }

  var_h->Finish(true);
  cls->DecreaseReqCount();

  VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";
  VLOG(4) << "Finish HandleFetchBarrierResponse";
}

void HandleGetResponse(brpc::Controller* cntl,
                       sendrecv::VariableMessage* response) {
                       sendrecv::VariableMessage* response, VarHandlePtr var_h,
                       ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx,
                       BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);

  // this channel can be used other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
    LOG(FATAL) << "Fail to GetVar: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    cls->DecreaseReqCount();
    var_h->Finish(false);
    return;
  }
  LOG(INFO) << "Received response from " << cntl->remote_side()
            << " latency=" << cntl->latency_us() << "us";

  // framework::Variable* outvar = nullptr;
  // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
  VLOG(4) << "HandleGetResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";

  framework::Variable* outvar = nullptr;
  int trainer_id;
  distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(),
                                    *var_h->ctx(), var_h->scope(), &outvar,
                                    &trainer_id);
  VLOG(4) << "Finish HandleGetResponse";
  cls->DecreaseReqCount();
  var_h->Finish(true);
}

bool BRPCClient::AsyncGetVar(const std::string& ep,
VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
                             const platform::DeviceContext& ctx,
                             const framework::Scope& scope,
                             const std::string& var_name, int64_t time_out) {
                             const std::string& var_name,
                             const std::string& method_name,
                             int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "GetRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));

  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();

    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
    cntl->set_timeout_ms(time_out);

    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
    req.set_trainer_id(trainer_id_);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    platform::RecordRPCEvent record_event(method, p_ctx);

  framework::AsyncIO(
      [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});

    if (method_name == "GetMonomerVariable") {
      ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
    } else {
      ch_ctx->stub->GetVariable(cntl, &req, response, done);
    }

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });

  req_count_++;

  return true;
  return var_h;
}

VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
    const std::string& ep, const platform::DeviceContext& ctx,
    const framework::Scope& scope, const std::string& var_name,
    int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out);
}

VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
                                                const std::string& var_name,
                                                int64_t time_out) {
  return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out);
}

bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
                                     const platform::DeviceContext& ctx,
                                     const framework::Scope& scope,
                                     const std::string& var_name,
                                     int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out);
}

VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
                                          const platform::DeviceContext& ctx,
                                          const framework::Scope& scope,
                                          const std::string& in_var_name,
                                          const std::string& out_var_name,
                                          const std::string& table_name,
                                          int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string in_var_name_val = in_var_name;
  const std::string out_var_name_val = out_var_name;
  const std::string table_name_val = table_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "PrefetchRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));

  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();

    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
    cntl->set_timeout_ms(time_out);

  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
                      time_out, ch, this] {});

    auto* var = p_scope->FindVar(in_var_name_val);
    sendrecv::VariableMessage req;
    distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req,
                                  &cntl->request_attachment(), out_var_name_val,
                                  false, 0, table_name_val);

    platform::RecordRPCEvent record_event(method, p_ctx);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    ch_ctx->stub->PrefetchVariable(cntl, &req, response, done);

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });

  req_count_++;
  return true;
  return var_h;
}

void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
                                               int64_t time_out) {
  req_count_++;
  return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE,
                          time_out);
}

void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
                                               int64_t time_out) {
  auto ch_ptr = GetChannel(ep);
  auto ch_ctx = ch_ptr->Pop();

  brpc::Controller* cntl = new brpc::Controller();
  sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
  cntl->set_timeout_ms(time_out);

  sendrecv::VariableMessage req;
  req.set_varname(FETCH_BARRIER_MESSAGE);

  const std::string method = "FetchBarrierRPC";
  // var handle
  VarHandlePtr var_h(
      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));

  platform::RecordRPCEvent record_event(method, nullptr);

  google::protobuf::Closure* done = brpc::NewCallback(
      &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

  ch_ctx->stub->GetVariable(cntl, &req, response, done);

  req_count_++;

  if (UNLIKELY(platform::IsProfileEnabled())) {
    var_h->Wait();
  }

  return var_h;
}

void BRPCClient::Wait() {
bool BRPCClient::Wait() {
  VLOG(9) << "begin to brpcclient wait";
  {
    std::unique_lock<std::mutex> lk(sync_mutex_);
    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
  }
  VLOG(9) << "end to brpcclient wait";
  return true;
}

ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  VLOG(4) << "begin to GetChannel:" << ep;
  {
    std::lock_guard<std::mutex> guard(chan_mutex_);
    auto it = channels_.find(ep);
    if (it != channels_.end()) {
      VLOG(4) << "end to GetChannel:" << ep;
      return it->second;
    }
  }
...
@@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());

  brpc::ChannelOptions options;
#ifdef PADDLE_WITH_BRPC_RDMA
  options.use_rdma = true;
#endif
  options.protocol = "baidu_std";
  options.connection_type = "pooled";
  options.connect_timeout_ms = 100;
  // don't use pooled type. the server can't afford that.
  options.connection_type = "single";
  options.connect_timeout_ms = 1000;
  options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
  options.max_retry = FLAGS_max_retry;

  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
  VLOG(1) << "create " << brpc_channel_num_per_server_
          << " brpc channels to pserver:" << ep;

  for (int i = 0; i < brpc_channel_num_per_server_; ++i) {
    std::shared_ptr<ChannelContext> c(new ChannelContext());
    if (c->channel.Init(ep.c_str(), &options) != 0) {
      LOG(FATAL) << "Fail to initialize channel";
...
@@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
    channels_[ep] = q;
  }
  VLOG(4) << "end to GetChannel:" << ep;
  return q;
}

VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
                                           int64_t time_out) {
  return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out);
}

void BRPCClient::SendComplete() {
  for (auto& kv : channels_) {
    AsyncSendComplete(kv.first);
  }
}

VarHandlePtr BRPCClient::AsyncSendVarMessage(
    const std::string& ep, const std::string& method_name,
    const sendrecv::VariableMessage& req, int64_t time_out) {
  auto ch_ptr = GetChannel(ep);
  auto ch_ctx = ch_ptr->Pop();

  brpc::Controller* cntl = new brpc::Controller();
  sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
  cntl->set_timeout_ms(time_out);

  platform::RecordRPCEvent record_event(method_name, nullptr);

  VarHandlePtr var_h(
      new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));

  google::protobuf::Closure* done = brpc::NewCallback(
      &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

  if (method_name == "CheckPointNotifyRPC") {
    ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
  } else if (method_name == "GetMonomerBarrier") {
    ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
  } else {
    ch_ctx->stub->SendVariable(cntl, &req, response, done);
  }
  req_count_++;

  if (UNLIKELY(platform::IsProfileEnabled())) {
    var_h->Wait();
  }

  return var_h;
}

VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep,
                                          const std::string& method_name,
                                          const std::string& message,
                                          int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(message);

  return AsyncSendVarMessage(ep, method_name, req, time_out);
}

VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                               const std::string& dir,
                                               int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
  req.set_out_varname(dir);

  return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out);
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
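The client above hands every RPC a pooled channel context, a completion callback, and a `VarHandlePtr` whose `Finish()`/`Wait()` pair, together with an atomic `req_count_`, lets `Wait()` block until all outstanding requests drain. The sketch below reproduces that bookkeeping pattern with plain threads instead of brpc; everything here (class, function names) is a hypothetical stand-in for illustration, not the Paddle or brpc API.

// Sketch of the async-request bookkeeping pattern in BRPCClient: a pending
// counter incremented per request, decremented in the completion callback,
// and a Wait() that blocks until the counter drains. brpc is replaced by
// std::thread; all names are illustrative.
#include <atomic>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class MiniClient {
 public:
  // Fire an asynchronous "request"; on_done(true) plays the role of
  // VarHandle::Finish in the real client.
  void AsyncSend(std::function<void(bool)> on_done) {
    ++req_count_;
    workers_.emplace_back([this, on_done] {
      on_done(true);       // pretend the RPC succeeded
      DecreaseReqCount();  // same role as BRPCClient::DecreaseReqCount
    });
  }

  bool Wait() {
    std::unique_lock<std::mutex> lk(sync_mutex_);
    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
    return true;
  }

  ~MiniClient() {
    for (auto& t : workers_) t.join();
  }

 private:
  void DecreaseReqCount() {
    if (--req_count_ <= 0) {
      std::lock_guard<std::mutex> lk(sync_mutex_);  // avoid a lost wakeup
      sync_cond_.notify_all();
    }
  }

  std::vector<std::thread> workers_;
  std::atomic<int64_t> req_count_{0};
  std::mutex sync_mutex_;
  std::condition_variable sync_cond_;
};

int main() {
  MiniClient client;
  std::atomic<int> ok{0};
  for (int i = 0; i < 8; ++i) {
    client.AsyncSend([&ok](bool success) { ok += success ? 1 : 0; });
  }
  client.Wait();
  std::cout << ok << " of 8 requests finished\n";
}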
paddle/fluid/operators/distributed/brpc_client.h
(As above, old and new versions of changed declarations appear next to each other as rendered.)
@@ -31,6 +31,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
...
@@ -53,33 +55,94 @@ class BRPCClient : public RPCClient {
  BRPCClient() {}
  virtual ~BRPCClient();

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
                    const framework::Scope& scope, const std::string& var_name,
  VarHandlePtr AsyncSendVar(const std::string& ep,
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncGetVar(const std::string& ep,
                           const platform::DeviceContext& ctx,
                           const framework::Scope& scope,
                           const std::string& var_name,
                           int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncGetMonomerBarrier(
      const std::string& ep, const std::string& var_name,
      int64_t time_out = FLAGS_rpc_deadline) override;

  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
  VarHandlePtr AsyncGetMonomerVariable(
      const std::string& ep, const platform::DeviceContext& ctx,
      const framework::Scope& scope, const std::string& var_name,
      int64_t time_out = FLAGS_rpc_deadline) override;

  bool AsyncPrefetchVar(const std::string& ep,
  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
                                const std::string& table_name = "",
                                int64_t time_out = FLAGS_rpc_deadline) override;

  void AsyncSendBatchBarrier(const std::string& ep,
                             int64_t time_out = FLAGS_rpc_deadline) override;
  VarHandlePtr AsyncSendBatchBarrier(
      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncSendFetchBarrier(
      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

  void AsyncSendFetchBarrier(const std::string& ep,
  VarHandlePtr AsyncCheckpointNotify(
      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) override;

  void Wait() override;
  bool Wait() override;

  void SendComplete() override;

 private:
  VarHandlePtr _AsyncGetVar(const std::string& ep,
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& method_name,
                            int64_t time_out = FLAGS_rpc_deadline);

  void Proceed();
  ChannelQueuePtr GetChannel(const std::string& ep);

  VarHandlePtr AsyncSendComplete(const std::string& ep,
                                 int64_t time_out = FLAGS_rpc_deadline);

  VarHandlePtr AsyncSendMessage(const std::string& ep,
                                const std::string& method_name,
                                const std::string& message, int64_t time_out);

  VarHandlePtr AsyncSendVarMessage(const std::string& ep,
                                   const std::string& method_name,
                                   const sendrecv::VariableMessage& req,
                                   int64_t time_out);

  friend void HandleSendResponse(brpc::Controller* cntl,
                                 sendrecv::VoidMessage* response,
                                 VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                 ChannelContextPtr ch_ctx, BRPCClient* cls);

  friend void HandleGetResponse(brpc::Controller* cntl,
                                sendrecv::VariableMessage* response,
                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                ChannelContextPtr ch_ctx, BRPCClient* cls);

  friend void HandleFetchBarrierResponse(brpc::Controller* cntl,
                                         sendrecv::VariableMessage* response,
                                         VarHandlePtr var_h,
                                         ChannelQueuePtr ch_ptr,
                                         ChannelContextPtr ch_ctx,
                                         BRPCClient* cls);
  void DecreaseReqCount() {
    if (--req_count_ <= 0) {
      sync_cond_.notify_all();
    }
  }

 private:
  std::unordered_map<std::string, ChannelQueuePtr> channels_;
...
@@ -88,6 +151,8 @@ class BRPCClient : public RPCClient {
  std::condition_variable sync_cond_;
  std::atomic<int64_t> req_count_{0};

  static constexpr int brpc_channel_num_per_server_ = 4;

  // mutex for GetChannel thread safety
  std::mutex chan_mutex_;
  DISABLE_COPY_AND_ASSIGN(BRPCClient);
...
paddle/fluid/operators/distributed/brpc_rdma_pool.cc
0 → 100644 (new file)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_BRPC_RDMA

#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "brpc/channel.h"
#include "brpc/rdma/rdma_helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {
namespace distributed {

RdmaMemPool& RdmaMemPool::Instance() {
  static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool();
  return *g_rdma_mem_pool;
}

void* RdmaMemPool::Find(const std::string& varname, int64_t size) {
  pthread_rwlock_rdlock(&access_);
  auto it = pool_.find(varname);
  if (it == pool_.end()) {
    pthread_rwlock_unlock(&access_);
    return nullptr;
  }

  auto info = it->second;
  if (info.data_size != size) {
    pthread_rwlock_unlock(&access_);
    PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size,
                   info.data_size);
    return nullptr;
  }

  pthread_rwlock_unlock(&access_);
  return info.data;
}

void RdmaMemPool::Register(const std::string& varname, void* data,
                           int64_t data_size) {
  void* old = Find(varname, data_size);
  if (old != nullptr) {
    if (data != old) {
      PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old);
    }
    VLOG(7) << "Find on rdma:" << varname << " data:" << data
            << " data_size:" << data_size;
    return;
  }

  VarInfo info;
  info.data = data;
  info.data_size = data_size;

  pthread_rwlock_wrlock(&access_);
  pool_[varname] = info;
  pthread_rwlock_unlock(&access_);

  if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) {
    LOG(FATAL) << "register " << varname << " data:" << data
               << " data_size:" << data_size << " error";
  }

  VLOG(4) << "register on rdma:" << varname << " data:" << data
          << " data_size:" << data_size;
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle

#endif
paddle/fluid/operators/distributed/brpc_rdma_pool.h
0 → 100644 (new file)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_BRPC_RDMA

#include <pthread.h>  // NOLINT
#include <string>
#include <unordered_map>

namespace paddle {
namespace operators {
namespace distributed {

/*
 * This class is used to avoid duplicated registion of brpc::rdma.
 */
class RdmaMemPool {
 public:
  static RdmaMemPool& Instance();

  RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {}

  virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); }

  void Register(const std::string& varname, void* data, int64_t size);
  void* Find(const std::string& varname, int64_t size);

 private:
  struct VarInfo {
    void* data;
    int64_t data_size;

    VarInfo() : data(nullptr), data_size(0) {}
  };

 private:
  std::unordered_map<std::string, VarInfo> pool_;
  pthread_rwlock_t access_;
};

}  // namespace distributed
}  // namespace operators
}  // namespace paddle

#endif
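RdmaMemPool above guards a name-to-buffer table with a pthread rwlock so concurrent senders can look up already-registered buffers cheaply while only writers serialize. The sketch below expresses the same read-mostly cache with C++17 `std::shared_mutex`; it is an illustration of the locking pattern under that assumption, not the Paddle class, and the RDMA registration call is reduced to a comment.

// Sketch of a read-mostly registration cache in the spirit of RdmaMemPool,
// using std::shared_mutex instead of pthread rwlocks. Illustrative only.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>

class RegistrationCache {
 public:
  // Returns the registered buffer for `name` if the recorded size matches,
  // otherwise nullptr. Readers only take the shared lock.
  void* Find(const std::string& name, int64_t size) const {
    std::shared_lock<std::shared_mutex> lock(mu_);
    auto it = cache_.find(name);
    if (it == cache_.end() || it->second.size != size) return nullptr;
    return it->second.data;
  }

  // Records a buffer once; re-registering the same pointer is a no-op.
  void Register(const std::string& name, void* data, int64_t size) {
    if (Find(name, size) == data) return;
    std::unique_lock<std::shared_mutex> lock(mu_);
    cache_[name] = Entry{data, size};
    // A real pool would also register `data` with the transport here
    // (e.g. an RDMA memory-registration call).
  }

 private:
  struct Entry {
    void* data = nullptr;
    int64_t size = 0;
  };
  mutable std::shared_mutex mu_;
  std::unordered_map<std::string, Entry> cache_;
};

int main() {
  RegistrationCache cache;
  static char buffer[1024];
  cache.Register("w@GRAD", buffer, sizeof(buffer));
  std::cout << (cache.Find("w@GRAD", sizeof(buffer)) == buffer) << "\n";  // 1
  std::cout << (cache.Find("w@GRAD", 512) == nullptr) << "\n";            // 1 (size mismatch)
}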
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
0 → 100644 (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include <sys/time.h>
#include <thread>  // NOLINT

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace operators {
namespace distributed {

class IOBufWriter {
 public:
  static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) {
    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
    iobuf->append(v, vlen);
  }

  static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v,
                                int64_t vlen, bool in_cuda_pinned,
                                void (*destroy)(void*), void* user_data) {
    VLOG(7) << "AppendTCPZeroCopy "
            << " k:" << k
            << " data:" << static_cast<void*>(const_cast<char*>(v))
            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;

    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);

    // FIXME(gongwb): use append_zerocopy
    /*
    if (in_cuda_pinned) {
      iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory);
    } else {
      iobuf->append_zerocopy(v, vlen, nullptr);
    }
    */
    iobuf->append(v, vlen);
    destroy(user_data);
  }

#ifdef PADDLE_WITH_BRPC_RDMA
  static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf,
                                 int k, const char* v, int64_t vlen,
                                 bool in_cuda_pinned, void (*destroy)(void*),
                                 void* user_data) {
    VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k
            << " data:" << static_cast<void*>(const_cast<char*>(v))
            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;

    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);

    RdmaMemPool::Instance().Register(
        varname, static_cast<void*>(const_cast<char*>(v)), vlen);

    // FIXME(gongwb): use append_zerocopy
    // iobuf->append_zerocopy(v, vlen, nullptr);
    iobuf->append(v, vlen);
    destroy(user_data);
    return;
  }
#endif

  static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf,
                             int k, const char* v, int64_t vlen,
                             bool in_cuda_pinned, void (*destroy)(void*),
                             void* user_data) {
#ifdef PADDLE_WITH_BRPC_RDMA
    IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
                                    destroy, user_data);
#else
    IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy,
                                   user_data);
#endif
  }
};

void SerializeToIOBuf(const std::string& name, framework::Variable* var,
                      const platform::DeviceContext& ctx, VarMsg* request,
                      butil::IOBuf* iobuf, const std::string& out_varname,
                      bool var_is_not_stable, int trainer_id,
                      const std::string& table_name) {
  std::unique_ptr<TensorPayload> payload;

  request->set_varname(name);
  request->set_trainer_id(trainer_id);
  // Note: normally the profiler is enabled in 1 trainer, hence only
  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
  // servers the trainer's profiling state so that PS can follow the
  // trainer.
  if (platform::ShouldSendProfileState()) {
    if (platform::IsProfileEnabled()) {
      request->set_profile(platform::kEnableProfiler);
    } else {
      request->set_profile(platform::kDisableProfiler);
    }
  }
  if (!out_varname.empty()) {
    request->set_out_varname(out_varname);
  }
  if (!table_name.empty()) {
    request->set_table_name(table_name);
  }
  if (var->IsType<framework::LoDTensor>()) {
    request->set_type(::sendrecv::LOD_TENSOR);
    payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request)));
  } else if (var->IsType<framework::SelectedRows>()) {
    request->set_type(::sendrecv::SELECTED_ROWS);
    payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
#ifdef PADDLE_WITH_CUDA
  } else if (var->IsType<ncclUniqueId>()) {
    request->set_type(::sendrecv::NCCL_ID);
    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
    // TODO(gongwb): use append_zero to avoid data copy.
    IOBufWriter::Append(iobuf,
                        sendrecv::VariableMessage::kSerializedFieldNumber,
                        uid.internal, NCCL_UNIQUE_ID_BYTES);
    return;
#endif
  } else {
    PADDLE_THROW("Serialize does not support type: %s",
                 typeid(var->Type()).name());
  }

  PADDLE_ENFORCE_NOT_NULL(payload);

  // FIXME(gongwb): it seems that can use zero copy.
  if (var_is_not_stable) {
    IOBufWriter::Append(
        iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
        static_cast<const char*>(payload->ptr()), payload->memory_size());
  } else {
    if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
      IOBufWriter::AppendZeroCopy(
          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
          static_cast<const char*>(payload->ptr()), payload->memory_size(),
          true, SerializeDestroyCallback, static_cast<void*>(payload.get()));
      payload.release();
#endif
    } else {
      IOBufWriter::AppendZeroCopy(
          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
          static_cast<const char*>(payload->ptr()), payload->memory_size(),
          false, SerializeDestroyCallback, static_cast<void*>(payload.get()));
      payload.release();
    }
  }

  if (var->IsType<framework::SelectedRows>()) {
    auto* slr = var->GetMutable<framework::SelectedRows>();
    size_t rows_memory_size =
        slr->rows().size() * framework::SizeOfType(typeid(int64_t));

    IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber,
                        reinterpret_cast<const char*>(slr->rows().data()),
                        static_cast<int64_t>(rows_memory_size));
  }
}

void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta,
                          const butil::IOBuf& iobuf,
                          const platform::DeviceContext& ctx,
                          const framework::Scope* scope,
                          framework::Variable** var, int* trainer_id) {
  operators::distributed::BRPCVariableResponse resp(scope, &ctx);
  PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!");
  *var = resp.GetVar();
  *trainer_id = resp.GetTrainerId();
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
0 → 100644 (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <sys/time.h>
#include <iostream>
#include <string>
#include <vector>

#include "brpc/channel.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"

namespace paddle {
namespace operators {
namespace distributed {

void SerializeToIOBuf(const std::string& name, framework::Variable* var,
                      const platform::DeviceContext& ctx, VarMsg* request,
                      butil::IOBuf* iobuf, const std::string& out_varname,
                      bool var_is_not_stable, const int trainer_id = 0,
                      const std::string& table_name = std::string());

void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf,
                          const platform::DeviceContext& ctx,
                          const framework::Scope* scope,
                          framework::Variable** var, int* trainer_id);

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
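`IOBufWriter::Append` above frames each field as a 4-byte field id, an 8-byte length, and then the raw payload bytes, in host byte order. The sketch below reproduces that framing with a plain byte vector so the layout is easy to verify in isolation; the field id value and the helpers are illustrative stand-ins, and a `std::vector<char>` replaces the brpc `butil::IOBuf`.

// Sketch of the (field id, length, payload) framing used by IOBufWriter::Append:
// 4 bytes of field id, 8 bytes of payload length, then the payload itself.
// Host byte order, like the original; std::vector<char> stands in for butil::IOBuf.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

void AppendField(std::vector<char>* buf, int32_t field_id, const char* data,
                 int64_t len) {
  buf->insert(buf->end(), reinterpret_cast<char*>(&field_id),
              reinterpret_cast<char*>(&field_id) + 4);
  buf->insert(buf->end(), reinterpret_cast<char*>(&len),
              reinterpret_cast<char*>(&len) + 8);
  buf->insert(buf->end(), data, data + len);
}

// Reads one frame starting at `offset`; returns the offset of the next frame.
size_t ReadField(const std::vector<char>& buf, size_t offset, int32_t* field_id,
                 std::string* payload) {
  std::memcpy(field_id, buf.data() + offset, 4);
  int64_t len = 0;
  std::memcpy(&len, buf.data() + offset + 4, 8);
  payload->assign(buf.data() + offset + 12, static_cast<size_t>(len));
  return offset + 12 + static_cast<size_t>(len);
}

int main() {
  std::vector<char> iobuf;
  const std::string tensor_bytes = "0123456789abcdef";  // pretend tensor payload
  AppendField(&iobuf, /*field_id=*/7, tensor_bytes.data(),
              static_cast<int64_t>(tensor_bytes.size()));

  int32_t field_id = 0;
  std::string payload;
  ReadField(iobuf, 0, &field_id, &payload);
  std::cout << "field " << field_id << ", " << payload.size()
            << " bytes: " << payload << "\n";
}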
paddle/fluid/operators/distributed/brpc_serde_test.cc
0 → 100644
浏览文件 @
740e1626
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "brpc/channel.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;

void RunSerdeTestSelectedRows(platform::Place place) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);

  butil::IOBuf iobuf;
  sendrecv::VariableMessage msg;
  int tensor_numel = 564 * 128;

  // serialize var to IOBuf
  {
    framework::Variable var;
    auto* slr = var.GetMutable<framework::SelectedRows>();
    slr->set_height(1000);
    auto* tensor = slr->mutable_value();
    auto* rows = slr->mutable_rows();
    tensor->Resize(framework::make_ddim({564, 128}));
    tensor->mutable_data<float>(place);
    math::set_constant(ctx, tensor, 32.7);
    for (int i = 0; i < 564; ++i) rows->push_back(i);

    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
                                             "", false);
  }

  // deserialize
  {
    framework::Scope scope;
    scope.Var("myvar");
    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
    EXPECT_EQ(resp.Parse(iobuf, msg), 0);

    framework::Variable* var2 = resp.GetVar();

    auto* slr2 = var2->GetMutable<framework::SelectedRows>();
    auto* tensor2 = slr2->mutable_value();
    auto* rows2 = slr2->mutable_rows();
    float* tensor_data2 = nullptr;
    framework::Tensor tmp_tensor;

    if (platform::is_gpu_place(ctx.GetPlace())) {
      platform::CPUPlace cpu;
      framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
      tensor_data2 = tmp_tensor.data<float>();
    } else {
      tensor_data2 = const_cast<float*>(tensor2->data<float>());
    }
    const int64_t* rows_data2 = rows2->data();

    for (int i = 0; i < tensor_numel; ++i) {
      EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
    }
    for (size_t i = 0; i < rows2->size(); ++i) {
      EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
    }
    EXPECT_EQ(slr2->height(), 1000);
  }
}

void RunTestLodTensor(platform::Place place) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);

  // serialize var to ByteBuffer
  butil::IOBuf iobuf;
  sendrecv::VariableMessage msg;
  int tensor_numel = 512 * 8 * 4 * 2;
  {
    framework::Variable var;
    auto* tensor = var.GetMutable<framework::LoDTensor>();
    tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
    framework::LoD lod;
    lod.push_back(framework::Vector<size_t>({1, 3, 8}));
    tensor->set_lod(lod);
    tensor->mutable_data<float>(place);
    math::set_constant(ctx, tensor, 31.9);

    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
                                             "", false);
  }

  // check sendrecv::VariableMessage meta data
  {
    EXPECT_EQ(msg.varname(), "myvar");
    EXPECT_EQ(msg.type(), 0);
    EXPECT_EQ(msg.dims()[0], 512);
    EXPECT_EQ(msg.dims()[1], 8);
    EXPECT_EQ(msg.dims()[2], 4);
    EXPECT_EQ(msg.dims()[3], 2);
    EXPECT_EQ(msg.lod_level(), 1);
    EXPECT_EQ(msg.lod(0).lod_data(0), 1);
    EXPECT_EQ(msg.lod(0).lod_data(1), 3);
    EXPECT_EQ(msg.lod(0).lod_data(2), 8);
  }

  // deserialize
  {
    framework::Scope scope;
    scope.Var("myvar");
    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
    EXPECT_EQ(resp.Parse(iobuf, msg), 0);

    framework::Variable* var2 = resp.GetVar();
    auto tensor2 = var2->Get<framework::LoDTensor>();
    float* tensor_data2 = nullptr;
    framework::Tensor tmp_tensor;

    if (platform::is_gpu_place(ctx.GetPlace())) {
      platform::CPUPlace cpu;
      framework::TensorCopy(tensor2, cpu, &tmp_tensor);
      tensor_data2 = tmp_tensor.data<float>();
    } else {
      tensor_data2 = const_cast<float*>(tensor2.data<float>());
    }

    for (int i = 0; i < tensor_numel; ++i)
      EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
  }
}

TEST(LodTensor, Run) {
  platform::CPUPlace place;
  RunTestLodTensor(place);
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace gpu(0);
  RunTestLodTensor(gpu);
#endif
}

TEST(SelectedRows, Run) {
  platform::CPUPlace place;
  RunSerdeTestSelectedRows(place);
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace gpu;
  RunSerdeTestSelectedRows(gpu);
#endif
}
paddle/fluid/operators/distributed/brpc_server.cc
...
...
@@ -13,84 +13,287 @@
// limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv {

typedef std::unordered_map<std::string,
                           paddle::operators::distributed::RequestHandler*>
namespace distributed = paddle::operators::distributed;

typedef std::unordered_map<std::string, distributed::RequestHandler*>
    HandlerMap;

class BRPCServiceImpl : public SendRecvService {
 public:
  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
      : request_send_h_(nullptr),
        request_get_h_(nullptr),
        request_prefetch_h_(nullptr) {
    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map,
                           distributed::RPCServer* rpc_server)
      : rpc_server_(rpc_server) {
    VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size();
    auto it = rpc_call_map.find(distributed::kRequestSend);
    if (it != rpc_call_map.end()) {
      request_send_h_ = it->second;
      send_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestSend)));
    }

    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
    it = rpc_call_map.find(distributed::kRequestGet);
    if (it != rpc_call_map.end()) {
      request_get_h_ = it->second;
      get_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestGet)));
    }

    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
    it = rpc_call_map.find(distributed::kRequestPrefetch);
    if (it != rpc_call_map.end()) {
      request_prefetch_h_ = it->second;
      prefetch_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
    }

    it = rpc_call_map.find(distributed::kRequestCheckpoint);
    if (it != rpc_call_map.end()) {
      request_checkpoint_h_ = it->second;
      checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
    }

  virtual ~BRPCServiceImpl() {}
    it = rpc_call_map.find(distributed::kRequestGetMonomerVariable);
    if (it != rpc_call_map.end()) {
      request_get_monomer_handler_h_ = it->second;
    }

    it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier);
    if (it != rpc_call_map.end()) {
      request_get_monomer_barrier_handler_h_ = it->second;
    }
  }

  virtual ~BRPCServiceImpl() {}

  void SendVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VoidMessage* response,
                    google::protobuf::Closure* done) override {
    send_threads_->Run(
        [=] { _SendVariable(cntl_butil, request, response, done); });
  }

  void _SendVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VoidMessage* response,
                     google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_send_h_ != nullptr,
                   "RequestSend handler should be registed first!");
    brpc::ClosureGuard done_guard(done);
    paddle::framework::Scope* local_scope = request_send_h_->scope();
    paddle::framework::Variable* outvar = nullptr;
    paddle::framework::Variable* invar = nullptr;
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    std::string varname = request->varname();
    VLOG(3) << "RequestSend var_name:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    if (!request_send_h_->sync_mode()) {
      local_scope = &request_send_h_->scope()->NewScope();
      invar = local_scope->Var(varname);
    } else {
      invar = local_scope->FindVar(varname);
    }
    distributed::BRPCVariableResponse resp(request_send_h_->scope(),
                                           request_send_h_->dev_ctx(),
                                           !request_send_h_->sync_mode());
    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
                   "parse iobuf to tensor error!");

    request_send_h_->Handle(varname, local_scope, invar, &outvar);
    auto scope = resp.GetMutableLocalScope();
    auto invar = resp.GetVar();
    int trainer_id = request->trainer_id();
    paddle::framework::Variable* outvar = nullptr;

    if (!request_send_h_->sync_mode()) {
      request_send_h_->scope()->DeleteScope(local_scope);
    }
    request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id);
  }

  void GetVariable(google::protobuf::RpcController* cntl_butil,
                   const VariableMessage* request, VariableMessage* response,
                   google::protobuf::Closure* done) override {
    get_threads_->Run(
        [=] { _GetVariable(cntl_butil, request, response, done); });
  }

  void _GetVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VariableMessage* response,
                    google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_get_h_ != nullptr,
                   "RequestGet handler should be registed first!");
  }

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
    std::string varname = request->varname();
    VLOG(3) << "RequestGet varname:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    auto scope = request_get_h_->scope();
    auto invar = scope->FindVar(varname);
    int trainer_id = request->trainer_id();
    paddle::framework::Variable* outvar = nullptr;

    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);

    if (outvar) {
      distributed::SerializeToIOBuf(varname, outvar,
                                    *request_get_h_->dev_ctx(), response,
                                    &cntl->response_attachment(), "", false);
    }
  }

  void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                        const VariableMessage* request,
                        VariableMessage* response,
                        google::protobuf::Closure* done) override {
    prefetch_threads_->Run(
        [=] { _PrefetchVariable(cntl_butil, request, response, done); });
  }

  void _PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
                         google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
                   "kRequestPrefetch handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    // prefetch process...
    std::string in_var_name = request->varname();
    std::string out_var_name = request->out_varname();
    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
            << ", out_var_name: " << out_var_name
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    distributed::BRPCVariableResponse resp(
        request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true);

    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
                   "parse iobuf to tensor error!");

    auto scope = resp.GetMutableLocalScope();
    auto invar = scope->FindVar(in_var_name);
    std::string table_name = request->table_name();

    int trainer_id = request->trainer_id();

    paddle::framework::Variable* outvar = scope->Var(out_var_name);
    request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
                                out_var_name, table_name);

    distributed::SerializeToIOBuf(out_var_name, outvar,
                                  *request_prefetch_h_->dev_ctx(), response,
                                  &cntl->response_attachment(), "", true);
  }

  void CheckpointNotify(google::protobuf::RpcController* cntl_butil,
                        const VariableMessage* request, VoidMessage* response,
                        google::protobuf::Closure* done) override {
    checkpoint_notify_threads_->Run(
        [=] { _CheckpointNotify(cntl_butil, request, response, done); });
  }

  void _CheckpointNotify(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request, VoidMessage* response,
                         google::protobuf::Closure* done) {
    PADDLE_ENFORCE(
        request_checkpoint_h_ != nullptr,
        "kRequestCheckpointNotify handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(),
                                           request_checkpoint_h_->dev_ctx());

    auto scope = resp.GetMutableLocalScope();

    std::string checkpoint_notify = request->varname();
    std::string checkpoint_dir = request->out_varname();
    int trainer_id = request->trainer_id();

    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
            << ", dir: " << checkpoint_dir
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr,
                                  trainer_id, checkpoint_dir);
  }

  void GetMonomerVariable(google::protobuf::RpcController* cntl_butil,
                          const VariableMessage* request,
                          VariableMessage* response,
                          google::protobuf::Closure* done) override {
    PADDLE_ENFORCE(
        request_get_monomer_handler_h_ != nullptr,
        "kRequestGetMonomerVariable handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    // proc request.
    std::string varname = request->varname();
    VLOG(3) << "GetMonomerVariable " << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    rpc_server_->WaitVarCond(varname);
    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);

    auto scope = h.scope_;
    auto invar = scope->FindVar(varname);
    paddle::framework::Variable* outvar = nullptr;

    request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar,
                                           request->trainer_id());

    if (outvar) {
      distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response,
                                    &cntl->response_attachment(), "", false);
    }
  }

  void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request, VoidMessage* response,
                         google::protobuf::Closure* done) override {
    PADDLE_ENFORCE(
        request_get_monomer_barrier_handler_h_ != nullptr,
        "RequestGetMonomerBarrier handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    std::string varname = request->varname();
    VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    rpc_server_->WaitVarCond(varname);
    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);

    paddle::framework::Scope* scope = nullptr;
    paddle::framework::Variable* invar = nullptr;
    paddle::framework::Variable* outvar = nullptr;

    request_get_monomer_barrier_handler_h_->Handle(
        varname, scope, invar, &outvar, request->trainer_id());
  }

 private:
  paddle::operators::distributed::RequestHandler* request_send_h_;
  paddle::operators::distributed::RequestHandler* request_get_h_;
  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
  distributed::RequestHandler* request_send_h_{nullptr};
  distributed::RequestHandler* request_get_h_{nullptr};
  distributed::RequestHandler* request_prefetch_h_{nullptr};
  distributed::RequestHandler* request_checkpoint_h_{nullptr};
  distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
  distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr};

  distributed::RPCServer* rpc_server_{nullptr};

  // FIXME(gongwb): brpc should support process one rpce use one threadpool.
  std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
};
}  // namespace sendrecv
...
...
@@ -100,7 +303,7 @@ namespace distributed {
void AsyncBRPCServer::StartServer() {
  // Instance of your service.
  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this);
// Add the service into server. Notice the second parameter, because the
// service is put on stack, we don't want server to delete it, otherwise
...
...
@@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() {
  }

  brpc::ServerOptions options;
#ifdef PADDLE_WITH_BRPC_RDMA
  options.use_rdma = true;
#endif
  options.idle_timeout_sec = idle_timeout_s_;
  options.max_concurrency = max_concurrency_;
  if (server_.Start(bind_address_.c_str(), &options) != 0) {
...
...
paddle/fluid/operators/distributed/brpc_variable_response.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace distributed {

namespace pb = ::google::protobuf;
using vr = ::sendrecv::VariableMessage;

int BRPCVariableResponse::Parse(Source* source) {
  pb::io::ZeroCopyInputStream* input_stream = source->contents();
  pb::io::CodedInputStream input(input_stream);
  input.SetTotalBytesLimit(INT_MAX, INT_MAX);

  while (1) {
    unsigned int tag = 0;
    if (!input.ReadLittleEndian32(&tag)) {
      break;
    }

    uint64_t num_bytes = 0;
    if (!input.ReadLittleEndian64(&num_bytes)) {
      break;
    }

    int field = static_cast<int>(tag);
    int ret = field == 0 ? -1 : field;
    switch (field) {
      case vr::kSerializedFieldNumber: {
        if (!ProcSerializedField(field, &input, num_bytes)) {
          return ret;
        }
        break;
      }
      case vr::kRowsFieldNumber: {
        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                        meta_.type() == sendrecv::LOD_TENSOR) &&
                           meta_.varname() != "",
                       "meta info should be got first!");

        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
          return ret;
        }
        break;
      }
      default: {
        PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field);
        return ret;
      }
    }
  }

  return 0;
}
}  // namespace distributed
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/distributed/brpc_variable_response.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "brpc/channel.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
namespace paddle {
namespace operators {
namespace distributed {

class BRPCSourceWrapper : public Source {
 public:
  explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {}
  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
    return &source_;
  }

 private:
  butil::IOBufAsZeroCopyInputStream source_;
};

class BRPCVariableResponse : public VariableResponse {
 public:
  BRPCVariableResponse(const framework::Scope* scope,
                       const platform::DeviceContext* dev_ctx,
                       bool create_scope = false)
      : VariableResponse(scope, dev_ctx, create_scope) {}

  virtual ~BRPCVariableResponse() {}

  // parse attachment from iobuf
  int Parse(Source* source) override;
  int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) {
    BRPCSourceWrapper wrapper(iobuf);
    return VariableResponse::Parse(&wrapper, meta);
  }
};

};  // namespace distributed
};  // namespace operators
};  // namespace paddle
paddle/fluid/operators/distributed/grpc_client.cc
...
...
@@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
  const auto ch = GetChannel(ep);
  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
  const std::string method = "SendMonomerFetchBarrierRPC";
  VarHandlePtr h(
      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
  VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
  s->Prepare(h, time_out);

  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
...
...
@@ -32,13 +32,6 @@ namespace paddle {
namespace operators {
namespace distributed {

static void SerializeDestroyCallback(void* payload) {
  if (payload != nullptr) {
    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
    delete shared_payload;
  }
}

void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg,
                           const std::string& out_name,
...
...
paddle/fluid/operators/distributed/rpc_server.h
...
...
@@ -75,6 +75,10 @@ class RPCServer {
  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
                   int thread_num = 5);

  int GetThreadNum(const std::string& rpc_name) {
    return rpc_thread_num_[rpc_name];
  }
// Wait util all the clients have reached the barrier for one
// rpc method. This function should be called in the
// RequestHandler if you want to run the server/client in a
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/port.h"
...
...
@@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor(
  memory::Copy(cuda_pinned, result->ptr(),
               boost::get<platform::CUDAPlace>(tensor.place()),
               tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
  ctx.Wait();
  return TensorPayload(result);
#else
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.h
...
...
@@ -50,6 +50,13 @@ class TensorPayload final {
  size_t memory_size_;
};

inline void SerializeDestroyCallback(void* payload) {
  if (payload != nullptr) {
    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
    delete shared_payload;
  }
}

TensorPayload GetTensorPayload(framework::Variable* var,
                               const platform::DeviceContext& ctx,
                               VarMsg* request);
...
...
paddle/fluid/operators/distributed_ops/CMakeLists.txt
...
...
@@ -2,9 +2,9 @@ include(operators)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
  set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
  set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
  set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
  if(WITH_BRPC_RDMA)
    find_library(IBVERBS_LIBRARY NAMES ibverbs)
    ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
...
...
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
...
...
@@ -26,10 +26,11 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get");
DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");

namespace paddle {
namespace operators {
...
...
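For context on how the thread-count flags above are consumed: listen_and_serv registers each RPC handler together with a thread number, and the brpc service (see brpc_server.cc earlier in this commit) later sizes its per-request thread pools via RPCServer::GetThreadNum. The sketch below is illustrative only; `rpc_service_` and the handler members are assumed to exist as in ListenAndServOp, and the exact call sites are not copied from this diff.

```cpp
// Hedged sketch: wiring the flags into per-RPC thread pools.
rpc_service_->RegisterRPC(distributed::kRequestSend,
                          request_send_handler_.get(),
                          FLAGS_rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestGet,
                          request_get_handler_.get(),
                          FLAGS_rpc_get_thread_num);
// Later, BRPCServiceImpl asks the server how many threads were requested:
// send_threads_.reset(new paddle::framework::ThreadPool(
//     rpc_server_->GetThreadNum(distributed::kRequestSend)));
```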
paddle/fluid/operators/distributed_ops/send_op.cc
...
...
@@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase {
    }
    if (sync_send) {
      for (size_t i = 0; i < rets.size(); i++) {
        VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
        VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
      }
    }
  }
...
...
paddle/fluid/operators/math/pooling.cc
...
...
@@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
                  bool exclusive, framework::Tensor* output) {
                  bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -51,16 +51,28 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* input_data = input.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
            }

            T ele = pool_process.initial();
            for (int h = hstart; h < hend; ++h) {
...
...
@@ -68,7 +80,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                pool_process.compute(input_data[h * input_width + w], &ele);
              }
            }
            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
            pool_process.finalize(static_cast<T>(pool_size), &ele);
            output_data[ph * output_width + pw] = ele;
...
...
@@ -94,7 +107,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_grad_process,
                  bool exclusive, framework::Tensor* input_grad) {
                  bool exclusive, bool adaptive, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -115,17 +128,30 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
              int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
            }
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
            float scale = 1.0 / pool_size;
            for (int h = hstart; h < hend; ++h) {
...
...
@@ -251,7 +277,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
                  bool exclusive, framework::Tensor* output) {
                  bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -276,20 +302,38 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* input_data = input.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }

              int output_idx = (pd * output_height + ph) * output_width + pw;
              T ele = pool_process.initial();
              for (int d = dstart; d < dend; ++d) {
...
...
@@ -302,7 +346,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
              }
            }
            int pool_size = exclusive
            int pool_size = (exclusive || adaptive)
                                ? (dend - dstart) * (hend - hstart) *
                                      (wend - wstart)
                                : ksize_depth * ksize_height * ksize_width;
            pool_process.finalize(static_cast<T>(pool_size), &ele);
...
...
@@ -330,7 +374,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_grad_process,
                  bool exclusive, framework::Tensor* input_grad) {
                  bool exclusive, bool adaptive, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -356,24 +400,41 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }
              int pool_size = exclusive
              int pool_size = (exclusive || adaptive)
                                  ? (dend - dstart) * (hend - hstart) *
                                        (wend - wstart)
                                  : ksize_depth * ksize_height * ksize_width;
              float scale = 1.0 / pool_size;
...
...
@@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask) {
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -538,16 +599,28 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
    T1* output_data = output->mutable_data<T1>(context.GetPlace());
    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
            }

            T1 ele = static_cast<T1>(-FLT_MAX);
            int index = -1;
...
...
@@ -584,7 +657,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_height = input_grad->dims()[2];
...
...
@@ -637,8 +710,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask) {
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -663,20 +736,38 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
    T1* output_data = output->mutable_data<T1>(context.GetPlace());
    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
              T1 ele = static_cast<T1>(-FLT_MAX);
...
...
@@ -718,7 +809,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_depth = input_grad->dims()[2];
...
...
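The adaptive branches added throughout this file rely on AdaptStartIndex / AdaptEndIndex to map each output bin to an input window. The helpers below are a sketch written from the floor/ceil convention these functions are generally assumed to follow (they live in pooling.h in the real code); they are reproduced here only to make the window mapping explicit, and `<cmath>` is assumed to be available.

```cpp
// Sketch of the adaptive-pooling window mapping assumed by the code above:
// output bin `ph` over an axis of `input_size` elements split into
// `output_size` roughly equal windows.
inline int AdaptStartIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      floor(static_cast<double>(ph * input_size) / output_size));
}

inline int AdaptEndIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
```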
paddle/fluid/operators/math/pooling.cu
...
...
@@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
const
int
ksize_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
output_data
)
{
bool
exclusive
,
bool
adaptive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -37,13 +37,23 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
    int c = (index / output_width / output_height) % channels;
    int batch_idx = index / output_width / output_height / channels;

    int hstart = ph * stride_height - padding_height;
    int hend = min(hstart + ksize_height, input_height);
    int hstart, hend;
    int wstart, wend;
    if (adaptive) {
      hstart = AdaptStartIndex(ph, input_height, output_height);
      hend = AdaptEndIndex(ph, input_height, output_height);

      wstart = AdaptStartIndex(pw, input_width, output_width);
      wend = AdaptEndIndex(pw, input_width, output_width);
    } else {
      hstart = ph * stride_height - padding_height;
      hend = min(hstart + ksize_height, input_height);
      hstart = max(hstart, 0);

    int wstart = pw * stride_width - padding_width;
    int wend = min(wstart + ksize_width, input_width);
      wstart = pw * stride_width - padding_width;
      wend = min(wstart + ksize_width, input_width);
      wstart = max(wstart, 0);
    }
    input_data += (batch_idx * channels + c) * input_height * input_width;
    T ele = pool_process.initial();
...
...
@@ -52,7 +62,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
        pool_process.compute(input_data[h * input_width + w], &ele);
      }
    }
    int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
    int pool_size = (exclusive || adaptive)
                        ? (hend - hstart) * (wend - wstart)
                        : ksize_height * ksize_width;
    pool_process.finalize(static_cast<T>(pool_size), &ele);
    output_data[index] = ele;
...
...
@@ -66,22 +76,33 @@ __global__ void KernelPool2DGrad(
    const int input_width, const int output_height, const int output_width,
    const int ksize_height, const int ksize_width, const int stride_height,
    const int stride_width, const int padding_height, const int padding_width,
    PoolProcess pool_process, bool exclusive, T* input_grad) {
    PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int offsetW = index % input_width + padding_width;
    int offsetH = (index / input_width) % input_height + padding_height;
    int w_offset = index % input_width + padding_width;
    int h_offset = (index / input_width) % input_height + padding_height;
    int offsetC = (index / input_width / input_height) % channels;
    int batch_idx = index / input_width / input_height / channels;

    int phstart = (offsetH < ksize_height)
    int phstart, phend;
    int pwstart, pwend;
    if (adaptive) {
      phstart = h_offset * output_height / input_height;
      phend =
          min((h_offset + 1) * output_height / input_height + 1, output_height);
      pwstart = w_offset * output_width / input_width;
      pwend =
          min((w_offset + 1) * output_width / input_width + 1, output_width);
    } else {
      phstart = (h_offset < ksize_height)
                    ? 0
                    : (offsetH - ksize_height) / stride_height + 1;
    int pwstart = (offsetW < ksize_width)
                    : (h_offset - ksize_height) / stride_height + 1;
      pwstart = (w_offset < ksize_width)
                    ? 0
                    : (offsetW - ksize_width) / stride_width + 1;
    int phend = min(offsetH / stride_height + 1, output_height);
    int pwend = min(offsetW / stride_width + 1, output_width);
                    : (w_offset - ksize_width) / stride_width + 1;
      phend = min(h_offset / stride_height + 1, output_height);
      pwend = min(w_offset / stride_width + 1, output_width);
    }
    T gradient = 0;
    T input = input_data[index];
    int output_idx =
...
...
@@ -90,14 +111,22 @@ __global__ void KernelPool2DGrad(
    output_grad += output_idx;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        int pool_size;
        if (adaptive) {
          pool_size = static_cast<int>(
                          ceil(static_cast<double>(input_height) / ksize_height)) *
                      static_cast<int>(
                          ceil(static_cast<double>(input_width) / ksize_width));
        } else {
          int hstart = ph * stride_height - padding_height;
          int wstart = pw * stride_width - padding_width;
          int hend = min(hstart + ksize_height, input_height);
          int wend = min(wstart + ksize_width, input_width);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
        int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
          pool_size = exclusive ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
        }
        int output_sub_idx = ph * output_width + pw;
        pool_process.compute(input, output_data[output_sub_idx],
                             output_grad[output_sub_idx],
...
...
@@ -181,7 +210,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
      nthreads, input, input_channels, input_height, input_width, output_height,
      output_width, ksize_height, ksize_width, stride_height, stride_width,
      padding_height, padding_width, pool_compute, exclusive, output);
      padding_height, padding_width, pool_compute, exclusive, false, output);
}
/*
...
...
@@ -196,7 +225,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -223,7 +252,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
output_data
);
adaptive
,
output_data
);
}
};
...
...
@@ -242,7 +271,8 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -270,7 +300,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
output_data
,
output_grad_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
input_grad_data
);
pool_process
,
exclusive
,
adaptive
,
input_grad_data
);
}
};
...
...
@@ -359,7 +389,7 @@ __global__ void KernelPool3D(
const
int
ksize_depth
,
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
output_data
)
{
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -368,15 +398,30 @@ __global__ void KernelPool3D(
int
c
=
(
index
/
output_width
/
output_height
/
output_depth
)
%
channels
;
int
batch_idx
=
index
/
output_width
/
output_height
/
output_depth
/
channels
;
int
dstart
=
pd
*
stride_depth
-
padding_depth
;
int
hstart
=
ph
*
stride_height
-
padding_height
;
int
wstart
=
pw
*
stride_width
-
padding_width
;
int
dend
=
min
(
dstart
+
ksize_depth
,
input_depth
);
int
hend
=
min
(
hstart
+
ksize_height
,
input_height
);
int
wend
=
min
(
wstart
+
ksize_width
,
input_width
);
int
dstart
,
dend
;
int
hstart
,
hend
;
int
wstart
,
wend
;
if
(
adaptive
)
{
dstart
=
AdaptStartIndex
(
pd
,
input_depth
,
output_depth
);
dend
=
AdaptEndIndex
(
pd
,
input_depth
,
output_depth
);
hstart
=
AdaptStartIndex
(
ph
,
input_height
,
output_height
);
hend
=
AdaptEndIndex
(
ph
,
input_height
,
output_height
);
wstart
=
AdaptStartIndex
(
pw
,
input_width
,
output_width
);
wend
=
AdaptEndIndex
(
pw
,
input_width
,
output_width
);
}
else
{
dstart
=
pd
*
stride_depth
-
padding_depth
;
hstart
=
ph
*
stride_height
-
padding_height
;
wstart
=
pw
*
stride_width
-
padding_width
;
dend
=
min
(
dstart
+
ksize_depth
,
input_depth
);
hend
=
min
(
hstart
+
ksize_height
,
input_height
);
wend
=
min
(
wstart
+
ksize_width
,
input_width
);
dstart
=
max
(
dstart
,
0
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
}
T
ele
=
pool_process
.
initial
();
input_data
+=
(
batch_idx
*
channels
+
c
)
*
input_depth
*
input_height
*
input_width
;
...
...
@@ -388,7 +433,7 @@ __global__ void KernelPool3D(
}
}
}
int
pool_size
=
exclusive
int
pool_size
=
(
exclusive
||
adaptive
)
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
...
...
@@ -405,28 +450,43 @@ __global__ void KernelPool3DGrad(
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
T
*
input_grad
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
offsetW
=
index
%
input_width
+
padding_width
;
int
offsetH
=
(
index
/
input_width
)
%
input_height
+
padding_height
;
int
offsetD
=
int
w_offset
=
index
%
input_width
+
padding_width
;
int
h_offset
=
(
index
/
input_width
)
%
input_height
+
padding_height
;
int
d_offset
=
(
index
/
input_width
/
input_height
)
%
input_depth
+
padding_depth
;
int
offsetC
=
(
index
/
input_width
/
input_height
/
input_depth
)
%
channels
;
int
batch_idx
=
index
/
input_width
/
input_height
/
input_depth
/
channels
;
int
pdstart
=
(
offsetD
<
ksize_depth
)
int
pdstart
,
pdend
;
int
phstart
,
phend
;
int
pwstart
,
pwend
;
if
(
adaptive
)
{
pdstart
=
d_offset
*
output_depth
/
input_depth
;
pdend
=
min
((
d_offset
+
1
)
*
output_depth
/
input_depth
+
1
,
output_depth
);
phstart
=
h_offset
*
output_height
/
input_height
;
phend
=
min
((
h_offset
+
1
)
*
output_height
/
input_height
+
1
,
output_height
);
pwstart
=
w_offset
*
output_width
/
input_width
;
pwend
=
min
((
w_offset
+
1
)
*
output_width
/
input_width
+
1
,
output_width
);
}
else
{
pdstart
=
(
d_offset
<
ksize_depth
)
?
0
:
(
offsetD
-
ksize_depth
)
/
stride_depth
+
1
;
int
phstart
=
(
offsetH
<
ksize_height
)
:
(
d_offset
-
ksize_depth
)
/
stride_depth
+
1
;
phstart
=
(
h_offset
<
ksize_height
)
?
0
:
(
offsetH
-
ksize_height
)
/
stride_height
+
1
;
int
pwstart
=
(
offsetW
<
ksize_width
)
:
(
h_offset
-
ksize_height
)
/
stride_height
+
1
;
pwstart
=
(
w_offset
<
ksize_width
)
?
0
:
(
offsetW
-
ksize_width
)
/
stride_width
+
1
;
int
pdend
=
min
((
offsetD
)
/
stride_depth
+
1
,
output_depth
);
int
phend
=
min
((
offsetH
)
/
stride_height
+
1
,
output_height
);
int
pwend
=
min
((
offsetW
)
/
stride_width
+
1
,
output_width
);
:
(
w_offset
-
ksize_width
)
/
stride_width
+
1
;
pdend
=
min
((
d_offset
)
/
stride_depth
+
1
,
output_depth
);
phend
=
min
((
h_offset
)
/
stride_height
+
1
,
output_height
);
pwend
=
min
((
w_offset
)
/
stride_width
+
1
,
          output_width);
    }
    T gradient = 0;
    T input = input_data[index];
...
...
@@ -439,6 +499,16 @@ __global__ void KernelPool3DGrad(
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
+       // figure out the pooling size
+       int pool_size;
+       if (adaptive) {
+         pool_size =
+             static_cast<int>(ceil(static_cast<double>(input_depth) / ksize_depth)) *
+             static_cast<int>(ceil(static_cast<double>(input_height) / ksize_height)) *
+             static_cast<int>(ceil(static_cast<double>(input_width) / ksize_width));
+       } else {
          int dstart = pd * stride_depth - padding_depth;
          int hstart = ph * stride_height - padding_height;
          int wstart = pw * stride_width - padding_width;
...
...
@@ -448,9 +518,10 @@ __global__ void KernelPool3DGrad(
          dstart = max(dstart, 0);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
-         int pool_size =
+         pool_size =
              exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                        : ksize_depth * ksize_height * ksize_width;
+       }
        int output_sub_idx = (pd * output_height + ph) * output_width + pw;
        pool_process.compute(input, output_data[output_sub_idx],
                             output_grad[output_sub_idx],
...
...
@@ -525,7 +596,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
-                 bool exclusive, framework::Tensor* output) {
+                 bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -559,7 +630,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
        input_width, output_depth, output_height, output_width, ksize_depth,
        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
        padding_depth, padding_height, padding_width, pool_process, exclusive,
-       output_data);
+       adaptive, output_data);
  }
};
...
...
@@ -578,7 +649,8 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
-                 bool exclusive, framework::Tensor* input_grad) {
+                 bool exclusive, bool adaptive,
+                 framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -614,7 +686,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
        input_depth, input_height, input_width, output_depth, output_height,
        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
        stride_height, stride_width, padding_depth, padding_height,
-       padding_width, pool_process, exclusive, input_grad_data);
+       padding_width, pool_process, exclusive, adaptive, input_grad_data);
  }
};
...
...
@@ -703,7 +775,7 @@ __global__ void KernelMaxPool2dWithIdx(
    const int input_height, const int input_width, const int output_height,
    const int output_width, const int ksize_height, const int ksize_width,
    const int stride_height, const int stride_width, const int padding_height,
-   const int padding_width, T1* output_data, T2* mask_data) {
+   const int padding_width, bool adaptive, T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
...
...
@@ -711,13 +783,23 @@ __global__ void KernelMaxPool2dWithIdx(
    int c = (index / output_width / output_height) % channels;
    int batch_idx = index / output_width / output_height / channels;

-   int hstart = ph * stride_height - padding_height;
-   int hend = min(hstart + ksize_height, input_height);
+   int hstart, hend;
+   int wstart, wend;
+   if (adaptive) {
+     hstart = AdaptStartIndex(ph, input_height, output_height);
+     hend = AdaptEndIndex(ph, input_height, output_height);
+     wstart = AdaptStartIndex(pw, input_width, output_width);
+     wend = AdaptEndIndex(pw, input_width, output_width);
+   } else {
+     hstart = ph * stride_height - padding_height;
+     hend = min(hstart + ksize_height, input_height);
      hstart = max(hstart, 0);

-     int wstart = pw * stride_width - padding_width;
-     int wend = min(wstart + ksize_width, input_width);
+     wstart = pw * stride_width - padding_width;
+     wend = min(wstart + ksize_width, input_width);
      wstart = max(wstart, 0);
+   }

    input_data += (batch_idx * channels + c) * input_height * input_width;
    T1 ele = -FLT_MAX;
...
...
@@ -742,36 +824,47 @@ __global__ void KernelMaxPool2DWithIdxGrad(
    const int channels, const int input_height, const int input_width,
    const int output_height, const int output_width, const int ksize_height,
    const int ksize_width, const int stride_height, const int stride_width,
-   const int padding_height, const int padding_width, T1* input_grad) {
+   const int padding_height, const int padding_width, bool adaptive,
+   T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
    int h_offset = (index / input_width) % input_height;
-   int c_offset = (index / input_width / input_height) % channels;
+   int offsetC = (index / input_width / input_height) % channels;
    int batch_idx = index / input_width / input_height / channels;

-   int ph_start = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
-   int pw_start = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
-   int ph_end = min((h_offset + padding_height) / stride_height + 1, output_height);
-   int pw_end = min((w_offset + padding_width) / stride_width + 1, output_width);
+   int phstart, phend;
+   int pwstart, pwend;
+   if (adaptive) {
+     phstart = h_offset * output_height / input_height;
+     phend = min((h_offset + 1) * output_height / input_height + 1, output_height);
+     pwstart = w_offset * output_width / input_width;
+     pwend = min((w_offset + 1) * output_width / input_width + 1, output_width);
+   } else {
+     phstart = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
+     pwstart = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
+     phend = min((h_offset + padding_height) / stride_height + 1, output_height);
+     pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+   }

    T1 gradient = 0;
    int input_current_featuremap_idx = h_offset * input_width + w_offset;
    int output_idx =
-       (batch_idx * channels + c_offset) * output_height * output_width;
+       (batch_idx * channels + offsetC) * output_height * output_width;

    mask_data += output_idx;
    output_grad += output_idx;
-   for (int ph = ph_start; ph < ph_end; ++ph) {
-     for (int pw = pw_start; pw < pw_end; ++pw) {
+   for (int ph = phstart; ph < phend; ++ph) {
+     for (int pw = pwstart; pw < pwend; ++pw) {
        if (mask_data[ph * output_width + pw] == input_current_featuremap_idx)
          gradient += output_grad[ph * output_width + pw];
      }
...
...
@@ -791,8 +884,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask) {
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
...
...
@@ -819,7 +912,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
        nthreads, input_data, input_channels, input_height, input_width,
        output_height, output_width, ksize_height, ksize_width, stride_height,
-       stride_width, padding_height, padding_width, output_data, mask_data);
+       stride_width, padding_height, padding_width, adaptive, output_data,
+       mask_data);
  }
};
...
...
@@ -835,7 +929,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
...
...
@@ -862,7 +956,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
        nthreads, output_grad_data, mask_data, input_channels, input_height,
        input_width, output_height, output_width, ksize_height, ksize_width,
-       stride_height, stride_width, padding_height, padding_width,
+       stride_height, stride_width, padding_height, padding_width, adaptive,
        input_grad_data);
  }
};
...
...
@@ -884,7 +978,7 @@ __global__ void KernelMaxPool3DWithIdx(
    const int ksize_depth, const int ksize_height, const int ksize_width,
    const int stride_depth, const int stride_height, const int stride_width,
    const int padding_depth, const int padding_height, const int padding_width,
-   T1* output_data, T2* mask_data) {
+   bool adaptive, T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
...
...
@@ -894,15 +988,29 @@ __global__ void KernelMaxPool3DWithIdx(
    int batch_idx = index / output_width / output_height / output_depth / channels;

-   int dstart = pd * stride_depth - padding_depth;
-   int hstart = ph * stride_height - padding_height;
-   int wstart = pw * stride_width - padding_width;
-   int dend = min(dstart + ksize_depth, input_depth);
-   int hend = min(hstart + ksize_height, input_height);
-   int wend = min(wstart + ksize_width, input_width);
+   int dstart, dend;
+   int hstart, hend;
+   int wstart, wend;
+   if (adaptive) {
+     dstart = AdaptStartIndex(pd, input_depth, output_depth);
+     dend = AdaptEndIndex(pd, input_depth, output_depth);
+     hstart = AdaptStartIndex(ph, input_height, output_height);
+     hend = AdaptEndIndex(ph, input_height, output_height);
+     wstart = AdaptStartIndex(pw, input_width, output_width);
+     wend = AdaptEndIndex(pw, input_width, output_width);
+   } else {
+     dstart = pd * stride_depth - padding_depth;
+     hstart = ph * stride_height - padding_height;
+     wstart = pw * stride_width - padding_width;
+     dend = min(dstart + ksize_depth, input_depth);
+     hend = min(hstart + ksize_height, input_height);
+     wend = min(wstart + ksize_width, input_width);
      dstart = max(dstart, 0);
      hstart = max(hstart, 0);
      wstart = max(wstart, 0);
+   }

    T1 ele = -FLT_MAX;
    int max_index = -1;
...
...
@@ -932,46 +1040,58 @@ __global__ void KernelMaxPool3DWithIdxGrad(
    const int output_width, const int ksize_depth, const int ksize_height,
    const int ksize_width, const int stride_depth, const int stride_height,
    const int stride_width, const int padding_depth, const int padding_height,
-   const int padding_width, T1* input_grad) {
+   const int padding_width, bool adaptive, T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
    int h_offset = (index / input_width) % input_height;
    int d_offset = (index / input_width / input_height) % input_depth;
-   int c_offset = (index / input_width / input_height / input_depth) % channels;
+   int offsetC = (index / input_width / input_height / input_depth) % channels;
    int batch_idx = index / input_width / input_height / input_depth / channels;

-   int pd_start = (d_offset + padding_depth < ksize_depth) ? 0 : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
-   int ph_start = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
-   int pw_start = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
-   int pd_end = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
-   int ph_end = min((h_offset + padding_height) / stride_height + 1, output_height);
-   int pw_end = min((w_offset + padding_width) / stride_width + 1, output_width);
+   int pdstart, pdend;
+   int phstart, phend;
+   int pwstart, pwend;
+   if (adaptive) {
+     pdstart = d_offset * output_depth / input_depth;
+     pdend = min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
+     phstart = h_offset * output_height / input_height;
+     phend = min((h_offset + 1) * output_height / input_height + 1, output_height);
+     pwstart = w_offset * output_width / input_width;
+     pwend = min((w_offset + 1) * output_width / input_width + 1, output_width);
+   } else {
+     pdstart = (d_offset + padding_depth < ksize_depth) ? 0 : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+     phstart = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
+     pwstart = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
+     pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+     phend = min((h_offset + padding_height) / stride_height + 1, output_height);
+     pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+   }

    T1 gradient = 0;
    int input_current_feature_map_idx =
        (d_offset * input_height + h_offset) * input_width + w_offset;
-   int output_idx = (batch_idx * channels + c_offset) * output_depth *
+   int output_idx = (batch_idx * channels + offsetC) * output_depth *
                     output_height * output_width;
    mask += output_idx;
    output_grad += output_idx;
-   for (int pd = pd_start; pd < pd_end; ++pd) {
-     for (int ph = ph_start; ph < ph_end; ++ph) {
-       for (int pw = pw_start; pw < pw_end; ++pw) {
+   for (int pd = pdstart; pd < pdend; ++pd) {
+     for (int ph = phstart; ph < phend; ++ph) {
+       for (int pw = pwstart; pw < pwend; ++pw) {
          if (mask[(pd * output_height + ph) * output_width + pw] ==
              input_current_feature_map_idx)
            gradient +=
...
...
@@ -994,8 +1114,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask) {
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -1029,7 +1149,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
        nthreads, input_data, input_channels, input_depth, input_height,
        input_width, output_depth, output_height, output_width, ksize_depth,
        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-       padding_depth, padding_height, padding_width, output_data, mask_data);
+       padding_depth, padding_height, padding_width, adaptive, output_data,
+       mask_data);
  }
};
...
...
@@ -1045,7 +1166,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
...
...
@@ -1079,7 +1200,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
        nthreads, output_grad_data, mask_data, input_channels, input_depth,
        input_height, input_width, output_depth, output_height, output_width,
        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-       stride_width, padding_depth, padding_height, padding_width,
+       stride_width, padding_depth, padding_height, padding_width, adaptive,
        input_grad_data);
  }
};
...
...
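The KernelPool3DGrad change above is easiest to read in isolation: with adaptive pooling the gradient of every output cell is averaged over a fixed count derived from the input/kernel ratio, while the non-adaptive path keeps the existing exclusive/inclusive window count. A minimal host-side sketch of just that divisor choice follows (the standalone helper and its name are illustrative, not part of the diff; only the arithmetic mirrors the kernel):

    #include <cmath>

    // Illustrative only: how the divisor used to average the pooled gradient is chosen.
    int PoolSizeForGrad(bool adaptive, bool exclusive,
                        int input_d, int input_h, int input_w,
                        int ksize_d, int ksize_h, int ksize_w,
                        int dstart, int dend, int hstart, int hend, int wstart, int wend) {
      if (adaptive) {
        // Adaptive path: one fixed count per output cell, ceil(input/ksize) in each dim.
        return static_cast<int>(std::ceil(static_cast<double>(input_d) / ksize_d)) *
               static_cast<int>(std::ceil(static_cast<double>(input_h) / ksize_h)) *
               static_cast<int>(std::ceil(static_cast<double>(input_w) / ksize_w));
      }
      // Non-adaptive path: clipped window volume (exclusive) or full kernel volume.
      return exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                       : ksize_d * ksize_h * ksize_w;
    }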
paddle/fluid/operators/math/pooling.h
View file @ 740e1626
...
...
@@ -68,6 +68,18 @@ class AvgPoolGrad {
  }
};

+/* used for adaptive pool to calculate start and end index of each divided grid
+ */
+HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      floor(static_cast<double>(ph * input_size) / output_size));
+}
+
+HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
+}
+
/*
 * \brief Getting pooling results, and calculating gradient.
 *
...
...
@@ -102,7 +114,7 @@ class Pool2dFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* output);
+                 bool exclusive, bool adaptive, framework::Tensor* output);
};

template <typename DeviceContext, typename PoolProcess, typename T>
...
...
@@ -114,7 +126,7 @@ class Pool2dGradFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* input_grad);
+                 bool exclusive, bool adaptive, framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
...
...
@@ -136,7 +148,7 @@ class Pool3dFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* output);
+                 bool exclusive, bool adaptive, framework::Tensor* output);
};

template <typename DeviceContext, typename PoolProcess, typename T>
...
...
@@ -148,7 +160,7 @@ class Pool3dGradFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* input_grad);
+                 bool exclusive, bool adaptive, framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
...
...
@@ -176,8 +188,8 @@ class MaxPool2dWithIndexFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const std::vector<int>& ksize, const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask);
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask);
};

template <typename DeviceContext, typename T1, typename T2>
...
...
@@ -187,7 +199,7 @@ class MaxPool2dWithIndexGradFunctor {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad);
};
...
...
@@ -197,8 +209,8 @@ class MaxPool3dWithIndexFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const std::vector<int>& ksize, const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask);
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask);
};

template <typename DeviceContext, typename T1, typename T2>
...
...
@@ -208,7 +220,7 @@ class MaxPool3dWithIndexGradFunctor {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad);
};
...
...
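To see what the new AdaptStartIndex/AdaptEndIndex helpers do, the standalone demo below (hypothetical test code, not part of the diff; the formulas are copied from the header) prints the input range each output bin covers. For input_size = 7 and output_size = 3 the bins are [0, 3), [2, 5) and [4, 7): adjacent bins may overlap, but together they always cover the whole input, which is what lets adaptive pooling produce an output of exactly ksize cells per dimension.

    #include <cmath>
    #include <cstdio>

    // Same formulas as AdaptStartIndex/AdaptEndIndex in pooling.h, duplicated for a host-only demo.
    int AdaptStart(int ph, int input_size, int output_size) {
      return static_cast<int>(std::floor(static_cast<double>(ph * input_size) / output_size));
    }
    int AdaptEnd(int ph, int input_size, int output_size) {
      return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * input_size) / output_size));
    }

    int main() {
      const int input_size = 7, output_size = 3;
      for (int ph = 0; ph < output_size; ++ph) {
        // Prints: bin 0 -> [0, 3), bin 1 -> [2, 5), bin 2 -> [4, 7)
        std::printf("bin %d -> [%d, %d)\n", ph, AdaptStart(ph, input_size, output_size),
                    AdaptEnd(ph, input_size, output_size));
      }
      return 0;
    }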
paddle/fluid/operators/pool_op.cc
View file @ 740e1626
...
...
@@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
  bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
+ bool adaptive = ctx->Attrs().Get<bool>("adaptive");

  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                 "Pooling intput should be 4-D or 5-D tensor.");
...
...
@@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Paddings size and pooling size should be the same.");

  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+ if (adaptive) {
+   output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+ } else {
    for (size_t i = 0; i < ksize.size(); ++i) {
-     output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                           paddings[i], strides[i], ceil_mode));
+     output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
+                                           paddings[i], strides[i], ceil_mode));
    }
+ }
  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  ctx->ShareLoD("X", "Out");
...
...
@@ -185,6 +190,14 @@ void Pool2dOpMaker::Make() {
      "averaging calculating, otherwise, include the zero-padding. Note, it "
      "is only used when pooling_type is avg. The defalut is True.")
      .SetDefault(true);
+ AddAttr<bool>(
+     "adaptive",
+     "(bool, default False) When true, will perform adaptive pooling instead, "
+     "output shape in H and W dimensions will be same as ksize, input data "
+     "will be divided into grids specify by ksize averagely and perform "
+     "pooling in each grid area to get output pooling value.")
+     .SetDefault(false);
+
  AddAttr<bool>(
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
...
...
@@ -263,6 +276,14 @@ Example:
       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
       $$

+  For adaptive = true:
+       $$
+       hstart = floor(i * H_{in} / H_{out})
+       hend = ceil((i + 1) * H_{in} / H_{out})
+       wstart = floor(j * W_{in} / W_{out})
+       wend = ceil((j + 1) * W_{in} / W_{out})
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+       $$

)DOC");
}
...
...
@@ -324,6 +345,13 @@ void Pool3dOpMaker::Make() {
      "averaging calculating, otherwise, include the zero-padding. Note, it "
      "is only used when pooling_type is avg. The defalut is True.")
      .SetDefault(true);
+ AddAttr<bool>(
+     "adaptive",
+     "(bool, default False) When true, will perform adaptive pooling instead, "
+     "output shape in H and W dimensions will be same as ksize, input data "
+     "will be divided into grids specify by ksize averagely and perform "
+     "pooling in each grid area to get output pooling value.")
+     .SetDefault(false);

  AddAttr<bool>(
      "use_cudnn",
...
...
@@ -375,6 +403,37 @@ Example:
       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
       $$

+  For exclusive = true:
+       $$
+       dstart = i * strides[0] - paddings[0]
+       dend = dstart + ksize[0]
+       hstart = j * strides[1] - paddings[1]
+       hend = hstart + ksize[1]
+       wstart = k * strides[2] - paddings[2]
+       wend = wstart + ksize[2]
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$
+
+  For exclusive = false:
+       $$
+       dstart = max(0, i * strides[0] - paddings[0])
+       dend = min(D, dstart + ksize[0])
+       hstart = max(0, j * strides[1] - paddings[1])
+       hend = min(H, hstart + ksize[1])
+       wstart = max(0, k * strides[2] - paddings[2])
+       wend = min(W, wstart + ksize[2])
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$
+
+  For adaptive = true:
+       $$
+       dstart = floor(i * D_{in} / D_{out})
+       dend = ceil((i + 1) * D_{in} / D_{out})
+       hstart = floor(j * H_{in} / H_{out})
+       hend = ceil((j + 1) * H_{in} / H_{out})
+       wstart = floor(k * W_{in} / W_{out})
+       wend = ceil((k + 1) * W_{in} / W_{out})
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$

)DOC");
}
...
...
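The InferShape change above boils down to a simple rule: with adaptive = true the spatial output dims are exactly ksize, otherwise each dim goes through the usual stride/padding formula. A self-contained sketch of that rule follows; PoolOutputSizeSketch is re-derived here from the documented formula (ceil_mode and global_pooling are left out for brevity) and is an assumption, not a call into Paddle:

    #include <cstdint>
    #include <vector>

    // Non-adaptive output size for one spatial dimension, following the documented
    // (in - ksize + 2 * padding) / stride + 1 formula (ceil_mode ignored here).
    int PoolOutputSizeSketch(int input_size, int ksize, int padding, int stride) {
      return (input_size - ksize + 2 * padding) / stride + 1;
    }

    std::vector<int64_t> InferPoolShape(const std::vector<int64_t>& in_dims,  // NCHW or NCDHW
                                        const std::vector<int>& ksize,
                                        const std::vector<int>& paddings,
                                        const std::vector<int>& strides, bool adaptive) {
      std::vector<int64_t> out({in_dims[0], in_dims[1]});
      if (adaptive) {
        // Adaptive: spatial output dims are simply ksize.
        out.insert(out.end(), ksize.begin(), ksize.end());
      } else {
        for (size_t i = 0; i < ksize.size(); ++i) {
          out.push_back(PoolOutputSizeSketch(static_cast<int>(in_dims[i + 2]), ksize[i],
                                             paddings[i], strides[i]));
        }
      }
      return out;
    }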
paddle/fluid/operators/pool_op.h
View file @ 740e1626
...
...
@@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel<T> {
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    bool exclusive = context.Attr<bool>("exclusive");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
...
...
@@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel<T> {
              pool2d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        true, out);
+                        true, false, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool2dFunctor<
...
...
@@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel<T> {
              pool2d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        exclusive, out);
+                        exclusive, adaptive, out);
        }
      } break;
      case 3: {
...
...
@@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel<T> {
              pool3d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        true, out);
+                        true, false, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool3dFunctor<
              DeviceContext, paddle::operators::math::AvgPool<T>, T>
              pool3d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        exclusive, out);
+                        exclusive, adaptive, out);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
...
...
@@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    bool exclusive = context.Attr<bool>("exclusive");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
...
...
@@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
              pool2d_backward;
          paddle::operators::math::AvgPoolGrad<T> pool_process;
          pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                         paddings, pool_process, exclusive, in_x_grad);
+                         paddings, pool_process, exclusive, adaptive,
+                         in_x_grad);
        }
      } break;
      case 3: {
...
...
@@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
              pool3d_backward;
          paddle::operators::math::AvgPoolGrad<T> pool_process;
          pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                         paddings, pool_process, exclusive, in_x_grad);
+                         paddings, pool_process, exclusive, adaptive,
+                         in_x_grad);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
...
...
paddle/fluid/operators/pool_with_index_op.cc
View file @ 740e1626
...
...
@@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+   bool adaptive = ctx->Attrs().Get<bool>("adaptive");

    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                   "Pooling intput should be 4-D or 5-D tensor.");
...
...
@@ -60,10 +61,14 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
                      "Paddings size and pooling size should be the same.");

    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+   if (adaptive) {
+     output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+   } else {
      for (size_t i = 0; i < ksize.size(); ++i) {
        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
                                                 paddings[i], strides[i]));
      }
+   }
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
  }
...
...
@@ -131,6 +136,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool, default:false) Whether to use the global pooling. "
        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
+   AddAttr<bool>(
+       "adaptive",
+       "(bool, default False) When true, will perform adaptive pooling "
+       "instead, "
+       "output shape in H and W dimensions will be same as ksize, input data "
+       "will be divided into grids specify by ksize averagely and perform "
+       "pooling in each grid area to get output pooling value.")
+       .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1, 1}), strides(height, "
                              "width) of pooling operator.")
...
...
@@ -168,6 +181,12 @@ Example:
       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$

+  For adaptive = true:
+       $$
+       H_{out} = ksize[0]  W_{out} = ksize[1]
+       $$

)DOC");
  }
};
...
...
@@ -207,6 +226,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool, default false) Whether to use the global pooling. "
        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
+   AddAttr<bool>(
+       "adaptive",
+       "(bool, default False) When true, will perform adaptive pooling "
+       "instead, "
+       "output shape in H and W dimensions will be same as ksize, input data "
+       "will be divided into grids specify by ksize averagely and perform "
+       "pooling in each grid area to get output pooling value.")
+       .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1,1,1}), strides(depth, "
                              "height, width) of pooling operator.")
...
...
@@ -245,6 +272,11 @@ Example:
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
       $$

+  For adaptive = true:
+       $$
+       D_{out} = ksize[0]  H_{out} = ksize[1]  W_{out} = ksize[2]
+       $$

)DOC");
  }
};
...
...
paddle/fluid/operators/pool_with_index_op.h
View file @ 740e1626
...
...
@@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+   bool adaptive = context.Attr<bool>("adaptive");

    auto& dev_ctx = context.template device_context<DeviceContext>();
    if (context.Attr<bool>("global_pooling")) {
...
...
@@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1, T2>
            pool2d_forward;
-       pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+       pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                      mask);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1, T2>
            pool3d_forward;
-       pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+       pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                      mask);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
...
...
@@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
...
...
@@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
                                                               T1, T2>
            pool2d_backward;
        pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                       paddings, in_x_grad);
+                       paddings, adaptive, in_x_grad);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
                                                               T1, T2>
            pool3d_backward;
        pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                       paddings, in_x_grad);
+                       paddings, adaptive, in_x_grad);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
...
...
paddle/fluid/operators/psroi_pool_op.cc
0 → 100644
View file @ 740e1626

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/psroi_pool_op.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor), "
             "the input of PSROIPoolOp. "
             "The format of input tensor is NCHW. Where N is the batch size, "
             "C is the number of input channels, "
             "H is the height of the input feature map, and "
             "W is the width.");
    AddInput("ROIs",
             "(LoDTensor), "
             "ROIs (Regions of Interest) to pool over. "
             "should be a 2-D LoDTensor of shape (num_rois, 4) "
             "given as [(x1, y1, x2, y2), ...]. "
             "where (x1, y1) is the top left coordinates, and "
             "(x2, y2) is the bottom right coordinates. "
             "The roi batch index can be calculated from LoD.");
    AddOutput("Out",
              "(Tensor), "
              "the output of PSROIPoolOp is a 4-D Tensor with shape "
              "(num_rois, output_channels, pooled_h, pooled_w).");
    AddAttr<int>("output_channels",
                 "(int), "
                 "the number of channels of the output feature map. "
                 "For a task of C classes of objects, output_channels should be "
                 "(C + 1) for classification only.");
    AddAttr<float>("spatial_scale",
                   "(float, default 1.0), "
                   "Multiplicative spatial scale factor "
                   "to translate ROI coords from their input scale "
                   "to the scale used when pooling.")
        .SetDefault(1.0);
    AddAttr<int>("pooled_height",
                 "(int, default 1), "
                 "the pooled output height.")
        .SetDefault(1);
    AddAttr<int>("pooled_width",
                 "(int, default 1), "
                 "the pooled output width.")
        .SetDefault(1);
    AddComment(R"Doc(
**PSROIPool Operator**

Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
position-sensitive average pooling on regions of interest specified by input, takes as
input N position-sensitive score maps and a list of num_rois regions of interest.

PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
    )Doc");
  }
};

class PSROIPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of PSROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
                   "Input(ROIs) of PSROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PSROIPoolOp should not be null.");
    auto input_dims = ctx->GetInputDim("X");
    auto rois_dims = ctx->GetInputDim("ROIs");

    PADDLE_ENFORCE(input_dims.size() == 4,
                   "The format of input tensor is NCHW");
    PADDLE_ENFORCE(rois_dims.size() == 2,
                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
                   "given as [(x1, y1, x2, y2), ...]");
    PADDLE_ENFORCE(rois_dims[1] == 4,
                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
                   "given as [(x1, y1, x2, y2), ...]");

    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
    int output_channels = ctx->Attrs().Get<int>("output_channels");
    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");

    PADDLE_ENFORCE(
        input_dims[1] == output_channels * pooled_height * pooled_width,
        "the channel of X(%d) should be equal to the product of "
        "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
        input_dims[1], output_channels, pooled_height, pooled_width);

    PADDLE_ENFORCE_GT(pooled_height, 0,
                      "The pooled output height must be greater than 0");
    PADDLE_ENFORCE_GT(pooled_width, 0,
                      "The pooled output width must be greater than 0");
    PADDLE_ENFORCE_GT(output_channels, 1,
                      "The pooled output channels must greater than 1");
    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
                      "The spatial scale must greater than 0.");

    auto out_dims = input_dims;
    out_dims[0] = rois_dims[0];
    out_dims[1] = output_channels;  // input_dims[1] / (pooled_height * pooled_width);
    out_dims[2] = pooled_height;
    out_dims[3] = pooled_width;
    ctx->SetOutputDim("Out", out_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

class PSROIPoolGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "The gradient of Out should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "The gradient of X should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
REGISTER_OP_CPU_KERNEL(
    psroi_pool,
    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    psroi_pool_grad,
    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
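The shape checks in PSROIPoolOp::InferShape encode the core PSROIPooling constraint: the input must carry one score map per (output channel, bin) pair, i.e. C_in = output_channels * pooled_height * pooled_width, and the output is (num_rois, output_channels, pooled_height, pooled_width). A small host-side sketch of that bookkeeping (hypothetical helpers, not part of the operator):

    #include <array>
    #include <stdexcept>

    // Maps an output position (c, ph, pw) to the input score-map channel it reads;
    // same index formula used by the CPU and GPU kernels below.
    int PSROIInputChannel(int c, int ph, int pw, int pooled_h, int pooled_w) {
      return (c * pooled_h + ph) * pooled_w + pw;
    }

    // Output shape of psroi_pool, with the same channel-count check as InferShape.
    std::array<int, 4> PSROIOutputShape(int in_channels, int num_rois, int output_channels,
                                        int pooled_h, int pooled_w) {
      if (in_channels != output_channels * pooled_h * pooled_w) {
        throw std::invalid_argument("C_in must equal output_channels * pooled_h * pooled_w");
      }
      return {num_rois, output_channels, pooled_h, pooled_w};
    }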
paddle/fluid/operators/psroi_pool_op.cu
0 → 100644
View file @ 740e1626

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/psroi_pool_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void GPUPSROIPoolForward(
    const int nthreads, const T* input_data, const T* input_rois,
    const float spatial_scale, const int input_channels, const int height,
    const int width, const int output_channels, const int pooled_height,
    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    // The output is in order (n, c, ph, pw)
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
    int c = (i / pooled_width / pooled_height) % output_channels;
    int n = i / pooled_width / pooled_height / output_channels;

    // set roi_batch_id
    int roi_batch_id = rois_batch_id_data[n];

    // [start, end) interval for spatial sampling
    const T* offset_input_rois = input_rois + n * 4;
    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
    T roi_end_w = static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);

    // Compute w and h at input feature map
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
    const T* offset_input_data =
        input_data +
        (roi_batch_id * input_channels + input_channel) * height * width;
    T outsum = 0;

    for (int ih = hstart; ih < hend; ++ih) {
      for (int iw = wstart; iw < wend; ++iw) {
        int input_index = ih * width + iw;
        outsum += offset_input_data[input_index];
      }
    }

    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
    output_data[i] = is_empty ? 0. : outsum / bin_area;
  }
}

template <typename T>
__global__ void GPUPSROIPoolBackward(
    const int nthreads, const T* input_rois, const T* output_grad_data,
    const float spatial_scale, const int input_channels, const int height,
    const int width, const int output_channels, const int pooled_height,
    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (int i = index; i < nthreads; i += offset) {
    // The output is in order (n, c, ph, pw)
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
    int c = (i / pooled_width / pooled_height) % output_channels;
    int n = i / pooled_width / pooled_height / output_channels;

    // set roi_batch_id
    int roi_batch_id = rois_batch_id_data[n];
    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
    int input_offset =
        (roi_batch_id * input_channels + input_channel) * height * width;
    T* offset_input_grad_data = input_grad_data + input_offset;

    // [start, end) interval for spatial sampling
    const T* offset_input_rois = input_rois + n * 4;
    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
    T roi_end_w = static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);

    // Compute w and h at input feature map
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Accumulate diff_val into input data
    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
    T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
    for (int ih = hstart; ih < hend; ++ih) {
      for (int iw = wstart; iw < wend; ++iw) {
        int input_index = ih * width + iw;
        platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val);
      }
    }
  }
}

template <typename Place, typename T>
class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
    auto* rois = ctx.Input<LoDTensor>("ROIs");
    auto* out = ctx.Output<Tensor>("Out");

    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    auto in_dims = in->dims();
    int batch_size = in_dims[0];
    int input_channels = in_dims[1];
    int height = in_dims[2];
    int width = in_dims[3];

    PADDLE_ENFORCE_EQ(input_channels,
                      output_channels * pooled_height * pooled_width,
                      "the channels of input X should equal the product of "
                      "output_channels x pooled_height x pooled_width");

    int rois_num = rois->dims()[0];
    if (rois_num == 0) return;

    auto rois_lod = rois->lod().back();
    int rois_batch_size = rois_lod.size() - 1;
    PADDLE_ENFORCE_EQ(
        rois_batch_size, batch_size,
        "The rois_batch_size and input(X) batch_size must be the same.");
    int rois_num_with_lod = rois_lod[rois_batch_size];
    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
                      "The rois_num from input and lod must be the same.");

    // set rois batch id
    framework::Tensor rois_batch_id_list;
    rois_batch_id_list.Resize({rois_num});
    int* rois_batch_id_data =
        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < rois_batch_size; ++n) {
      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
        rois_batch_id_data[i] = n;
      }
    }

    framework::Tensor rois_batch_id_list_gpu;
    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
                          ctx.device_context(), &rois_batch_id_list_gpu);

    int output_size = out->numel();
    int blocks = NumBlocks(output_size);
    int threads = kNumCUDAThreads;

    // call cuda kernel function
    GPUPSROIPoolForward<T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
        input_channels, height, width, output_channels, pooled_height,
        pooled_width, rois_batch_id_list_gpu.data<int>(),
        out->mutable_data<T>(ctx.GetPlace()));
  }
};

template <typename Place, typename T>
class GPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
    auto* rois = ctx.Input<LoDTensor>("ROIs");
    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));

    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    int rois_num = rois->dims()[0];
    int input_channels = in->dims()[1];
    int height = in->dims()[2];
    int width = in->dims()[3];

    if (input_grad) {
      // set roi batch id
      framework::Tensor rois_batch_id_list;
      rois_batch_id_list.Resize({rois_num});
      int* rois_batch_id_data =
          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
      auto rois_lod = rois->lod().back();
      int rois_batch_size = rois_lod.size() - 1;
      for (int n = 0; n < rois_batch_size; ++n) {
        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
          rois_batch_id_data[i] = n;
        }
      }

      framework::Tensor rois_batch_id_list_gpu;
      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
                            ctx.device_context(), &rois_batch_id_list_gpu);

      input_grad->mutable_data<T>(ctx.GetPlace());
      math::SetConstant<Place, T> set_zero;
      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));

      int output_grad_size = output_grad->numel();
      int blocks = NumBlocks(output_grad_size);
      int threads = kNumCUDAThreads;

      if (output_grad_size > 0) {
        GPUPSROIPoolBackward<T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
            output_grad_size, rois->data<T>(), output_grad->data<T>(),
            spatial_scale, input_channels, height, width, output_channels,
            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
            input_grad->mutable_data<T>(ctx.GetPlace()));
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    psroi_pool,
    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    psroi_pool_grad,
    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
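Both CUDA kernels above and the CPU kernels in psroi_pool_op.h share the same per-bin geometry: the ROI is scaled by spatial_scale, split into pooled_height x pooled_width bins, each bin is clipped to the feature map, and an empty bin contributes 0. The host-side sketch below reproduces just that boundary math for one bin (illustrative only; float stands in for the template type T):

    #include <algorithm>
    #include <cmath>

    struct Bin { int hstart, hend, wstart, wend; bool empty; };

    // Bin [hstart, hend) x [wstart, wend) for output cell (ph, pw) of one ROI,
    // following the same rounding, +1 end offset, 0.1 minimum size and clipping
    // used by GPUPSROIPoolForward/Backward.
    Bin PSROIBin(const float roi[4], float spatial_scale, int height, int width,
                 int pooled_h, int pooled_w, int ph, int pw) {
      float roi_start_w = std::round(roi[0]) * spatial_scale;
      float roi_start_h = std::round(roi[1]) * spatial_scale;
      float roi_end_w = (std::round(roi[2]) + 1.f) * spatial_scale;
      float roi_end_h = (std::round(roi[3]) + 1.f) * spatial_scale;
      float roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // force >= 0.1
      float roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
      float bin_h = roi_height / pooled_h, bin_w = roi_width / pooled_w;
      Bin b;
      b.hstart = std::min(std::max(static_cast<int>(std::floor(bin_h * ph + roi_start_h)), 0), height);
      b.hend = std::min(std::max(static_cast<int>(std::ceil(bin_h * (ph + 1) + roi_start_h)), 0), height);
      b.wstart = std::min(std::max(static_cast<int>(std::floor(bin_w * pw + roi_start_w)), 0), width);
      b.wend = std::min(std::max(static_cast<int>(std::ceil(bin_w * (pw + 1) + roi_start_w)), 0), width);
      b.empty = (b.hend <= b.hstart) || (b.wend <= b.wstart);
      return b;
    }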
paddle/fluid/operators/psroi_pool_op.h
0 → 100644
浏览文件 @
740e1626
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
CPUPSROIPoolOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
rois
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"ROIs"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
pooled_height
=
ctx
.
Attr
<
int
>
(
"pooled_height"
);
auto
pooled_width
=
ctx
.
Attr
<
int
>
(
"pooled_width"
);
auto
spatial_scale
=
ctx
.
Attr
<
float
>
(
"spatial_scale"
);
auto
output_channels
=
ctx
.
Attr
<
int
>
(
"output_channels"
);
auto
in_dims
=
in
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
in_stride
=
framework
::
stride
(
in_dims
);
auto
roi_stride
=
framework
::
stride
(
rois
->
dims
());
auto
out_stride
=
framework
::
stride
(
out
->
dims
());
const
T
*
input_data
=
in
->
data
<
T
>
();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
int
*
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
auto
rois_lod
=
rois
->
lod
().
back
();
int
rois_batch_size
=
rois_lod
.
size
()
-
1
;
PADDLE_ENFORCE_EQ
(
rois_batch_size
,
batch_size
,
"the rois_batch_size and input(X) batch_size should be the same."
);
int
rois_num_with_lod
=
rois_lod
[
rois_batch_size
];
PADDLE_ENFORCE_EQ
(
rois_num_with_lod
,
rois_num
,
"the rois_num from input and lod must be the same"
);
PADDLE_ENFORCE_EQ
(
input_channels
,
output_channels
*
pooled_height
*
pooled_width
,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
T
*
output_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
T
*
input_rois
=
rois
->
data
<
T
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// set roi batch id
int
roi_batch_id
=
rois_batch_id_data
[
n
];
// [start, end) interval for spatial sampling
const
T
*
offset_input_rois
=
input_rois
+
n
*
4
;
T
roi_start_w
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
T
roi_start_h
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
1
]))
*
spatial_scale
;
T
roi_end_w
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
2
])
+
1.
)
*
spatial_scale
;
T
roi_end_h
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small rois to be 1 x 1
T
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
(
T
)
0.1
);
// avoid 0
T
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
(
T
)
0.1
);
// Compute bin size w and h at input feature map
T
bin_size_h
=
roi_height
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
roi_width
/
static_cast
<
T
>
(
pooled_width
);
// calculate each pixel of the output feature map.
int
out_roi_offset
=
n
*
out_stride
[
0
];
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
// per category
int
out_plane_offset
=
out_roi_offset
+
c
*
out_stride
[
1
];
for
(
int
ph
=
0
;
ph
<
pooled_height
;
++
ph
)
{
int
out_row_offset
=
out_plane_offset
+
ph
*
out_stride
[
2
];
for
(
int
pw
=
0
;
pw
<
pooled_width
;
++
pw
)
{
// calculate w and h at input feature map
int
hstart
=
floor
(
static_cast
<
T
>
(
ph
)
*
bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
T
>
(
pw
)
*
bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
T
>
(
ph
+
1
)
*
bin_size_h
+
roi_start_h
);
int
wend
=
ceil
(
static_cast
<
T
>
(
pw
+
1
)
*
bin_size_w
+
roi_start_w
);
// Add roi offsets and clip to input boundaries
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
wstart
=
std
::
min
(
std
::
max
(
wstart
,
0
),
width
);
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
int
output_index
=
out_row_offset
+
pw
;
int
input_channel
=
(
c
*
pooled_height
+
ph
)
*
pooled_width
+
pw
;
int
input_plane_offset
=
roi_batch_id
*
in_stride
[
0
]
+
input_channel
*
in_stride
[
1
];
const
T
*
offset_input_data
=
input_data
+
input_plane_offset
;
T
out_sum
=
0.
;
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
for
(
int
ih
=
hstart
;
ih
<
hend
;
++
ih
)
{
for
(
int
iw
=
wstart
;
iw
<
wend
;
++
iw
)
{
int
input_index
=
ih
*
in_stride
[
2
]
+
iw
;
out_sum
+=
offset_input_data
[
input_index
];
}
}
T
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
                                               wstart);
            output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
          }
        }
      }
    }
    return;
  }
};

template <typename DeviceContext, typename T>
class CPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
    auto* output_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    if (input_grad) {
      auto in_dims = in->dims();
      int input_channels = in_dims[1];
      int height = in_dims[2];
      int width = in_dims[3];
      int rois_num = rois->dims()[0];

      // set roi batch id
      framework::Tensor rois_batch_id_list;
      rois_batch_id_list.Resize({rois_num});
      int* rois_batch_id_data =
          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
      auto rois_lod = rois->lod().back();
      int rois_batch_size = rois_lod.size() - 1;
      // calculate batch id index for each roi according to LoD
      for (int n = 0; n < rois_batch_size; ++n) {
        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
          rois_batch_id_data[i] = n;
        }
      }

      const T* input_rois = rois->data<T>();
      const T* output_grad_data = output_grad->data<T>();
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());

      // set gradient of X to be 0. before backpropagate.
      math::SetConstant<DeviceContext, T> set_zero;
      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
               static_cast<T>(0));

      // backpropagate gradient per output pixel
      int output_grad_size = output_grad->numel();
      for (int i = 0; i < output_grad_size; ++i) {
        // The output is in order (n, c, ph, pw)
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % output_channels;
        int n = i / pooled_width / pooled_height / output_channels;

        // set roi_batch_id
        int roi_batch_id = rois_batch_id_data[n];
        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
        int input_offset =
            (roi_batch_id * input_channels + input_channel) * height * width;
        T* offset_input_grad_data = input_grad_data + input_offset;

        // [start, end) interval for spatial sampling
        const T* offset_input_rois = input_rois + n * 4;
        T roi_start_w =
            static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
        T roi_start_h =
            static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
        T roi_end_w =
            static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
        T roi_end_h =
            static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

        // Force too small ROIs to be 1x1
        T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
        T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);

        // Compute w and h at input feature map
        T bin_size_h = roi_height / static_cast<T>(pooled_height);
        T bin_size_w = roi_width / static_cast<T>(pooled_width);

        int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
        int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
        int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
        int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

        // Add roi offsets and clip to input boundaries
        hstart = std::min(std::max(hstart, 0), height);
        hend = std::min(std::max(hend, 0), height);
        wstart = std::min(std::max(wstart, 0), width);
        wend = std::min(std::max(wend, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Accumulate diff_val into input data
        T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
        T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
        for (int ih = hstart; ih < hend; ++ih) {
          for (int iw = wstart; iw < wend; ++iw) {
            int input_index = ih * width + iw;
            offset_input_grad_data[input_index] += diff_val;
          }
        }
      }
    }
    return;
  }
};

}  // namespace operators
}  // namespace paddle
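For reference, the accumulation performed by the loop above can be stated compactly (same symbols as the kernel; this is only a restatement of the code, not new behaviour): every input location (ih, iw) inside a non-empty pooling bin of output element (n, c, ph, pw) receives an equal share of that output gradient,

\[
\frac{\partial L}{\partial x_{ih,\,iw}} \mathrel{+}= \frac{1}{(h_{\text{end}}-h_{\text{start}})\,(w_{\text{end}}-w_{\text{start}})}\;\frac{\partial L}{\partial y_{n,c,ph,pw}},
\]

while empty bins (hend <= hstart or wend <= wstart) contribute nothing, which matches diff_val = 0.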
paddle/fluid/operators/reader/ctr_reader.h
@@ -16,6 +16,7 @@
 #include <sys/time.h>
 #include <algorithm>
 #include <chrono>  // NOLINT
+#include <cstdlib>
 #include <fstream>
@@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader {
     PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
     PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ =
-        file_list_.size() > thread_num ? thread_num : file_list_.size();
+    thread_num_ = std::min<size_t>(file_list_.size(), thread_num);
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {
@@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
     VLOG(3) << "thread_num " << thread_num_;
-    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
-      read_threads_.emplace_back(new std::thread(
-          std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
-                    thread_id, &read_thread_status_, queue_)));
+    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
+      read_threads_.emplace_back(new std::thread(std::bind(
+          &ReadThread, file_groups_[thread_id], slots_, batch_size_,
+          static_cast<int>(thread_id), &read_thread_status_, queue_)));
     }
     monitor_thread_.reset(new std::thread(
         std::bind(&MonitorThread, &read_thread_status_, queue_)));
paddle/fluid/operators/spp_op.h
@@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel<T> {
       math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
       math::MaxPool<T> max_process;
       pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                   kernel_size, strides, paddings, max_process, true,
+                   kernel_size, strides, paddings, max_process, true, false,
                    &out_level);
     } else if (pooling_type == "avg") {
       math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
       math::AvgPool<T> avg_process;
       pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                   kernel_size, strides, paddings, avg_process, true,
+                   kernel_size, strides, paddings, avg_process, true, false,
                    &out_level);
     }
     // flatten pooling output shape
@@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
       math::AvgPoolGrad<T> avg_process;
       pool_backward(context.template device_context<DeviceContext>(), *in_x,
                     *&out_level, *&outgrad_level, kernel_size, strides,
-                    paddings, avg_process, true, in_x_grad);
+                    paddings, avg_process, true, false, in_x_grad);
     }
   }
 }
paddle/fluid/platform/CMakeLists.txt
@@ -56,9 +56,16 @@ ELSE()
   set(MKLDNN_CTX_DEPS)
 ENDIF()
 
+nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+
+IF(WITH_GPU)
+  set(STREAM_CALLBACK_DEPS stream_callback_manager)
+ELSE()
+  set(STREAM_CALLBACK_DEPS)
+ENDIF()
+
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
paddle/fluid/platform/device_context.cc
@@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
paddle/fluid/platform/device_context.h
@@ -222,14 +222,10 @@ class CUDADeviceContext : public DeviceContext {
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
     callback_manager_->AddCallback(callback);
   }
 
-  void WaitStreamCallback() const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
-    callback_manager_->Wait();
-  }
+  void WaitStreamCallback() const { callback_manager_->Wait(); }
 
 #if CUDA_VERSION >= 9000
   /*! \brief CublasCall may need to change cublas's config,
@@ -260,9 +256,7 @@ class CUDADeviceContext : public DeviceContext {
   mutable std::mutex mtx_;
 
-  // This lock is only used by callback
-  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
-  mutable std::mutex callback_mtx_;
+  // StreamCallbackManager is thread-safe
   std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   mutable std::mutex cublas_mtx_;
paddle/fluid/platform/stream_callback_manager.cc (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/stream_callback_manager.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {

#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
                                         cudaError_t status, void *user_data)
#endif
{
  std::unique_ptr<std::function<void()>> func(
      reinterpret_cast<std::function<void()> *>(user_data));
  (*func)();
}

StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
    : stream_(stream), thread_pool_(1) {}

void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
  auto *callback_func = new std::function<void()>(std::move(callback));
  auto *func = new std::function<void()>([this, callback_func] {
    std::lock_guard<std::mutex> lock(mtx_);
    last_future_ = thread_pool_.enqueue([callback_func] {
      std::unique_ptr<std::function<void()>> releaser(callback_func);
      (*callback_func)();
    });
  });
#if CUDA_VERSION >= 10000
  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
#else
  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#endif
}

void StreamCallbackManager::Wait() const {
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  {
    std::lock_guard<std::mutex> lock(mtx_);
    if (last_future_.valid()) {
      last_future_.wait();
    }
  }
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/stream_callback_manager.h
@@ -18,67 +18,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
 
-class StreamCallbackManager;
-
-struct StreamCallbackContext {
-  template <typename Callback>
-  inline StreamCallbackContext(const StreamCallbackManager *manager,
-                               Callback &&callback)
-      : manager_(manager), callback_(callback) {}
-
-  const StreamCallbackManager *manager_;  // do not own
-  std::function<void()> callback_;
-};
-
+// NOTE(zjl): clean StreamCallbackManager to make compilation faster
+// Make StreamCallbackManager thread-safe
 class StreamCallbackManager {
  public:
-  explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
-      : stream_(stream), thread_pool_(new ThreadPool(1)) {}
+  explicit StreamCallbackManager(const cudaStream_t stream);
 
   ~StreamCallbackManager() = default;
 
-  template <typename Callback>
-  inline void AddCallback(Callback &&callback) const {
-    auto *stream_callback_context =
-        new StreamCallbackContext(this, std::forward<Callback>(callback));
-#if CUDA_VERSION >= 10000
-    PADDLE_ENFORCE(cudaLaunchHostFunc(
-        stream_, StreamCallbackManager::StreamCallbackFunc,
-        stream_callback_context));  // NOLINT
-#else
-    PADDLE_ENFORCE(cudaStreamAddCallback(
-        stream_, StreamCallbackManager::StreamCallbackFunc,
-        stream_callback_context, 0));  // NOLINT
-#endif
-  }
+  void AddCallback(std::function<void()> callback) const;
 
-  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
+  void Wait() const;
 
  private:
   const cudaStream_t stream_;
-  mutable std::unique_ptr<ThreadPool> thread_pool_;
-
-  // cudaStreamCallback cannot call CUDA API inside, so we have to use
-  // thread_pool here
-#if CUDA_VERSION >= 10000
-  static void CUDART_CB StreamCallbackFunc(void *user_data)
-#else
-  static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
-                                           cudaError_t status, void *user_data)
-#endif
-  {
-    auto *callback_context_ptr =
-        reinterpret_cast<StreamCallbackContext *>(user_data);
-    callback_context_ptr->manager_->thread_pool_->enqueue([=]() {
-      std::unique_ptr<StreamCallbackContext> callback_context(
-          callback_context_ptr);
-      callback_context->callback_();
-    });
-  }
+  mutable ::ThreadPool thread_pool_;
+  mutable std::mutex mtx_;
+  mutable std::future<void> last_future_;
 };
 
 }  // namespace platform
paddle/fluid/pybind/pybind.cc
@@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithBrpc() {
+#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
+  return true;
+#else
+  return false;
+#endif
+}
+
 bool IsCompiledWithDIST() {
 #ifdef PADDLE_WITH_DISTRIBUTE
   return true;
@@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
paddle/fluid/pybind/tensor_py.h
@@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (int i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
paddle/scripts/paddle_build.sh
@@ -517,6 +517,18 @@ function assert_api_spec_approvals() {
           fi
       fi
     done
+
+    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
+    if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have at least 2 approvals for the const_cast"
+            exit 1
+        fi
+    fi
 }
python/paddle/fluid/__init__.py
@@ -126,9 +126,9 @@ def __bootstrap__():
         'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
         'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
-        'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
-        'pe_profile_fname'
+        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
+        'allocator_strategy', 'reader_queue_speed_test_mode',
+        'print_sub_graph_dir', 'pe_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -152,6 +152,7 @@ def __bootstrap__():
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'selected_gpus'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
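The newly registered flags ('eager_delete_tensor_gb', 'fast_eager_deletion_mode') are read from the environment in __bootstrap__, which is what the eager-deletion unit tests added later in this commit rely on. A minimal sketch of how a script would opt in; the value for 'fast_eager_deletion_mode' below is an assumption, while the 'eager_delete_tensor_gb' value mirrors the new tests:

# Sketch: set eager-deletion flags before paddle.fluid is imported,
# following the pattern used by the new test_eager_deletion_* tests.
import os
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'      # delete tensors as soon as possible
os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'   # assumed boolean value

import paddle.fluid as fluid  # flags are consumed by fluid.__bootstrap__()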
python/paddle/fluid/layers/nn.py
@@ -52,6 +52,8 @@ __all__ = [
     'softmax',
     'pool2d',
     'pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
@@ -173,6 +175,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'psroi_pool',
 ]
 
 kIgnoreIndex = -100
@@ -2499,6 +2502,204 @@ def pool3d(input,
     return pool_out


@templatedoc(op_type="pool2d")
def adaptive_pool2d(input,
                    pool_size,
                    pool_type="max",
                    require_index=False,
                    name=None):
    """
    ${comment}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCHW, where N is batch size, C is
                          the number of channels, H is the height of the
                          feature, and W is the width of the feature.
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two integers, (pool_size_Height, pool_size_Width).
        pool_type: ${pooling_type_comment}
        require_index (bool): If true, the index of the max pooling point is returned along
            with the outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The pooling result.

    Raises:
        ValueError: 'pool_type' is not 'max' nor 'avg'.
        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
        ValueError: 'pool_size' should be a list or tuple with length as 2.

    Examples:
        .. code-block:: python

          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n],
          # output shape is [N, C, m, n], adaptive pool divides H and W dimensions
          # of input data into m * n grids evenly and performs pooling in each
          # grid to get the output.
          # adaptive average pool performs calculations as follows:
          #
          #     for i in range(m):
          #         for j in range(n):
          #             hstart = floor(i * H / m)
          #             hend = ceil((i + 1) * H / m)
          #             wstart = floor(j * W / n)
          #             wend = ceil((j + 1) * W / n)
          #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
          #
          data = fluid.layers.data(
              name='data', shape=[3, 32, 32], dtype='float32')
          pool_out = fluid.layers.adaptive_pool2d(
                            input=data,
                            pool_size=[3, 3],
                            pool_type='avg')
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
            str(pool_type))

    if pool_type == "avg" and require_index:
        raise ValueError(
            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")

    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))

    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
        raise ValueError(
            "'pool_size' should be a list or tuple with length as 2.")

    if pool_type == "max":
        l_type = 'max_pool2d_with_index'
    else:
        l_type = "pool2d"

    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()
    pool_out = helper.create_variable_for_type_inference(dtype)

    outputs = {"Out": pool_out}
    if pool_type == "max":
        mask = helper.create_variable_for_type_inference(dtype)
        outputs["Mask"] = mask

    helper.append_op(
        type=l_type,
        inputs={"X": input},
        outputs=outputs,
        attrs={
            "pooling_type": pool_type,
            "ksize": pool_size,
            "adaptive": True,
        })

    return (pool_out, mask) if require_index else pool_out
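For intuition, a standalone NumPy-only sketch of the adaptive grid described in the docstring above (plain arithmetic on a toy array, not the Paddle API): each of the m x n output cells averages a floor/ceil-bounded window of the input.

# NumPy sketch of adaptive average pooling; values chosen arbitrarily.
import numpy as np

def adaptive_avg_pool2d_naive(x, m, n):
    N, C, H, W = x.shape
    out = np.zeros((N, C, m, n))
    for i in range(m):
        hstart, hend = int(np.floor(i * H / m)), int(np.ceil((i + 1) * H / m))
        for j in range(n):
            wstart, wend = int(np.floor(j * W / n)), int(np.ceil((j + 1) * W / n))
            # average over the (possibly overlapping) window for this cell
            out[:, :, i, j] = x[:, :, hstart:hend, wstart:wend].mean(axis=(2, 3))
    return out

x = np.random.rand(1, 3, 32, 32)
print(adaptive_avg_pool2d_naive(x, 3, 3).shape)  # (1, 3, 3, 3)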
@templatedoc(op_type="pool3d")
def adaptive_pool3d(input,
                    pool_size,
                    pool_type="max",
                    require_index=False,
                    name=None):
    """
    ${comment}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCDHW, where N is batch size, C is
                          the number of channels, D is the depth, H is the
                          height, and W is the width of the feature.
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain three integers, (Depth, Height, Width).
        pool_type: ${pooling_type_comment}
        require_index (bool): If true, the index of the max pooling point is returned along
            with the outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The pooling result.

    Raises:
        ValueError: 'pool_type' is not 'max' nor 'avg'.
        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
        ValueError: 'pool_size' should be a list or tuple with length as 3.

    Examples:
        .. code-block:: python

          # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
          # output shape is [N, C, l, m, n], adaptive pool divides D, H and W dimensions
          # of input data into l * m * n grids evenly and performs pooling in each
          # grid to get the output.
          # adaptive average pool performs calculations as follows:
          #
          #     for i in range(l):
          #         for j in range(m):
          #             for k in range(n):
          #                 dstart = floor(i * D / l)
          #                 dend = ceil((i + 1) * D / l)
          #                 hstart = floor(j * H / m)
          #                 hend = ceil((j + 1) * H / m)
          #                 wstart = floor(k * W / n)
          #                 wend = ceil((k + 1) * W / n)
          #                 output[:, :, i, j, k] =
          #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
          #
          data = fluid.layers.data(
              name='data', shape=[3, 32, 32, 32], dtype='float32')
          pool_out = fluid.layers.adaptive_pool3d(
                            input=data,
                            pool_size=[3, 3, 3],
                            pool_type='avg')
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
            str(pool_type))

    if pool_type == "avg" and require_index:
        raise ValueError(
            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")

    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))

    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
        raise ValueError(
            "'pool_size' should be a list or tuple with length as 3.")

    if pool_type == "max":
        l_type = 'max_pool3d_with_index'
    else:
        l_type = "pool3d"

    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()
    pool_out = helper.create_variable_for_type_inference(dtype)

    outputs = {"Out": pool_out}
    if pool_type == "max":
        mask = helper.create_variable_for_type_inference(dtype)
        outputs["Mask"] = mask

    helper.append_op(
        type=l_type,
        inputs={"X": input},
        outputs=outputs,
        attrs={
            "pooling_type": pool_type,
            "ksize": pool_size,
            "adaptive": True,
        })

    return (pool_out, mask) if require_index else pool_out


def batch_norm(input,
               act=None,
               is_test=False,
@@ -9122,3 +9323,57 @@ def get_tensor_from_selected_rows(x, name=None):
         outputs={'Out': out},
         attrs={})
     return out


@templatedoc()
def psroi_pool(input,
               rois,
               output_channels,
               spatial_scale,
               pooled_height,
               pooled_width,
               name=None):
    """
    ${comment}

    Args:
        input (Variable): ${x_comment}
        rois (Variable): ROIs (Regions of Interest) to pool over.
        output_channels (integer): ${output_channels_comment}
        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
        pooled_height (integer): ${pooled_height_comment} Default: 1
        pooled_width (integer): ${pooled_width_comment} Default: 1
        name (str, default None): The name of this layer.

    Returns:
        Variable: ${out_comment}.

    Examples:
        .. code-block:: python

            pool_out = fluid.layers.psroi_pool(input=x, rois=rois,
                                               output_channels=490,
                                               spatial_scale=1.0,
                                               pooled_height=7,
                                               pooled_width=7)
    """
    helper = LayerHelper('psroi_pool', **locals())
    # check attrs
    if not isinstance(output_channels, int):
        raise TypeError("output_channels must be int type")
    if not isinstance(spatial_scale, float):
        raise TypeError("spatial_scale must be float type")
    if not isinstance(pooled_height, int):
        raise TypeError("pooled_height must be int type")
    if not isinstance(pooled_width, int):
        raise TypeError("pooled_width must be int type")
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
        type='psroi_pool',
        inputs={'X': input,
                'ROIs': rois},
        outputs={'Out': out},
        attrs={
            'output_channels': output_channels,
            'spatial_scale': spatial_scale,
            'pooled_height': pooled_height,
            'pooled_width': pooled_width
        })
    return out
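A short usage sketch of the new layer, mirroring the unit test added below in test_layers.py (the shapes and attribute values are the ones that test uses, shown here with keyword arguments for clarity):

# Sketch: build a program that applies psroi_pool, following test_layers.py.
import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[245, 30, 30], dtype="float32")
rois = fluid.layers.data(name="rois", shape=[4], dtype="float32", lod_level=1)
out = fluid.layers.psroi_pool(x, rois,
                              output_channels=5,
                              spatial_scale=0.25,
                              pooled_height=7,
                              pooled_width=7)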
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
+
+if(WITH_NGRAPH)
+    add_subdirectory(ngraph)
+endif()
python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt (new file, 0 → 100644)
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true)
endforeach(TEST_OP)
python/paddle/fluid/tests/unittests/ngraph/__init__.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.
# (Standard license header only; this package marker file contains no code.)
python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -227,6 +227,7 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
+        self._port_set = set()
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
@@ -242,10 +243,18 @@ class TestDistBase(unittest.TestCase):
         self._after_setup_config()
 
     def _find_free_port(self):
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(('', 0))
-            return s.getsockname()[1]
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
 
     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
         ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['CPU_NUM'] = '2'

import six
import unittest

import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid


def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

    word_dict = paddle.dataset.imdb.word_dict()
    train_reader = paddle.batch(
        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost = network(data, label, len(word_dict))
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
    optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(
        train_reader, multi_devices=use_parallel_executor)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if use_parallel_executor:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=cost.name)
        fetch_list = [cost.name]
    else:
        train_exe = exe
        fetch_list = [cost]

    for pass_id in six.moves.xrange(pass_num):
        batch_id = 0
        for data in reader():
            train_exe.run(feed=data,
                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
            batch_id += 1
            if batch_id > 16:
                break


class TestBase(unittest.TestCase):
    def setUp(self):
        self.net = None

    def test_network(self):
        if self.net is None:
            return

        for use_cuda in [True, False]:
            for use_parallel_executor in [False, True]:
                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
                      format(self.net.__name__, use_cuda,
                             use_parallel_executor))
                with fluid.program_guard(fluid.Program(), fluid.Program()):
                    with fluid.scope_guard(core.Scope()):
                        train(self.net, use_cuda, use_parallel_executor)
python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import unittest
from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid


def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=400.0):
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)
    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost


class GRUTest(TestBase):
    def setUp(self):
        self.net = gru_net


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid
import unittest


def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)
    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost


class LSTMTest(TestBase):
    def setUp(self):
        self.net = lstm_net


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

from test_parallel_executor_mnist import TestMNIST


class EagerDeletionTestMNIST(TestMNIST):
    pass


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

from test_parallel_executor_transformer import TestTransformer


class EagerDeletionTestTransformer(TestTransformer):
    pass


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_layers.py
@@ -233,6 +233,29 @@ class TestBook(unittest.TestCase):
                     pool_stride=[1, 2],
                     pool_padding=(2, 1)))
 
+    def test_adaptive_pool2d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool2d(x, [3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
+    def test_adaptive_pool3d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[3, 244, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool3d(
+                x, [3, 3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
     def test_lstm_unit(self):
         program = Program()
         with program_guard(program):
@@ -511,6 +534,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
             print(str(program))
 
+    def test_psroi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
+            self.assertIsNotNone(output)
+            print(str(program))
+
     def test_roi_align(self):
         program = Program()
         with program_guard(program):
python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,16 +22,28 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
max_pool2D_forward_naive
(
x
,
ksize
,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
,
exclusive
=
True
):
exclusive
=
True
,
adaptive
=
False
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
if
adaptive
:
H_out
,
W_out
=
ksize
else
:
H_out
=
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
//
strides
[
0
]
+
1
if
ceil_mode
else
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
])
//
strides
[
0
]
+
1
...
...
@@ -40,6 +53,12 @@ def max_pool2D_forward_naive(x,
out
=
np
.
zeros
((
N
,
C
,
H_out
,
W_out
))
for
i
in
range
(
H_out
):
for
j
in
range
(
W_out
):
if
adaptive
:
r_start
=
adaptive_start_index
(
i
,
H
,
ksize
[
0
])
r_end
=
adaptive_end_index
(
i
,
H
,
ksize
[
0
])
c_start
=
adaptive_start_index
(
j
,
W
,
ksize
[
1
])
c_end
=
adaptive_end_index
(
j
,
W
,
ksize
[
1
])
else
:
r_start
=
np
.
max
((
i
*
strides
[
0
]
-
paddings
[
0
],
0
))
r_end
=
np
.
min
((
i
*
strides
[
0
]
+
ksize
[
0
]
-
paddings
[
0
],
H
))
c_start
=
np
.
max
((
j
*
strides
[
1
]
-
paddings
[
1
],
0
))
...
...
@@ -56,10 +75,14 @@ def avg_pool2D_forward_naive(x,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
,
exclusive
=
True
):
exclusive
=
True
,
adaptive
=
False
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
if
adaptive
:
H_out
,
W_out
=
ksize
else
:
H_out
=
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
//
strides
[
0
]
+
1
if
ceil_mode
else
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
])
//
strides
[
0
]
+
1
...
...
@@ -69,14 +92,20 @@ def avg_pool2D_forward_naive(x,
out
=
np
.
zeros
((
N
,
C
,
H_out
,
W_out
))
for
i
in
range
(
H_out
):
for
j
in
range
(
W_out
):
if
adaptive
:
r_start
=
adaptive_start_index
(
i
,
H
,
ksize
[
0
])
r_end
=
adaptive_end_index
(
i
,
H
,
ksize
[
0
])
c_start
=
adaptive_start_index
(
j
,
W
,
ksize
[
1
])
c_end
=
adaptive_end_index
(
j
,
W
,
ksize
[
1
])
else
:
r_start
=
np
.
max
((
i
*
strides
[
0
]
-
paddings
[
0
],
0
))
r_end
=
np
.
min
((
i
*
strides
[
0
]
+
ksize
[
0
]
-
paddings
[
0
],
H
))
c_start
=
np
.
max
((
j
*
strides
[
1
]
-
paddings
[
1
],
0
))
c_end
=
np
.
min
((
j
*
strides
[
1
]
+
ksize
[
1
]
-
paddings
[
1
],
W
))
x_masked
=
x
[:,
:,
r_start
:
r_end
,
c_start
:
c_end
]
field_size
=
((
r_end
-
r_start
)
*
(
c_end
-
c_start
))
if
exclusive
\
else
(
ksize
[
0
]
*
ksize
[
1
])
field_size
=
((
r_end
-
r_start
)
*
(
c_end
-
c_start
))
\
if
(
exclusive
or
adaptive
)
else
(
ksize
[
0
]
*
ksize
[
1
])
out
[:,
:,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
))
/
field_size
return
out
...
...
@@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest):
self
.
init_pool_type
()
self
.
init_ceil_mode
()
self
.
init_exclusive
()
self
.
init_adaptive
()
if
self
.
global_pool
:
self
.
paddings
=
[
0
for
_
in
range
(
len
(
self
.
paddings
))]
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
output
=
self
.
pool2D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mode
,
self
.
exclusive
).
astype
(
self
.
dtype
)
self
.
ceil_mode
,
self
.
exclusive
,
self
.
adaptive
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
)}
self
.
attrs
=
{
...
...
@@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest):
'ceil_mode'
:
self
.
ceil_mode
,
'data_format'
:
'AnyLayout'
,
# TODO(dzhwinter) : should be fix latter
'exclusive'
:
self
.
exclusive
'exclusive'
:
self
.
exclusive
,
'adaptive'
:
self
.
adaptive
}
self
.
outputs
=
{
'Out'
:
output
}
...
...
@@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest):
def
init_exclusive
(
self
):
self
.
exclusive
=
True
def
init_adaptive
(
self
):
self
.
adaptive
=
False
class
TestCase1
(
TestPool2D_Op
):
def
init_test_case
(
self
):
...
...
@@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2):
self
.
exclusive
=
False
class
TestAvgPoolAdaptive
(
TestCase1
):
def
init_adaptive
(
self
):
self
.
adaptive
=
True
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,16 +22,28 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
 def max_pool3D_forward_naive(x,
                              ksize,
                              strides,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
                  ) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
@@ -42,14 +55,26 @@ def max_pool3D_forward_naive(x,
                  W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
             d_start = np.max((k * strides[0] - paddings[0], 0))
             d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -62,10 +87,14 @@ def avg_pool3D_forward_naive(x,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
                  ) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
@@ -77,18 +106,30 @@ def avg_pool3D_forward_naive(x,
                  W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
            d_start = np.max((k * strides[0] - paddings[0], 0))
            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
-                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                    if exclusive else ksize[0] * ksize[1] * ksize[2]
+                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
+                    if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
                 out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size
 
     return out
@@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest):
         self.init_pool_type()
         self.init_ceil_mode()
         self.init_exclusive()
+        self.init_adaptive()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool3D_forward_naive(
             input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive).astype(self.dtype)
+            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest):
             'ceil_mode': self.ceil_mode,
             'data_format': 'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive
+            'exclusive': self.exclusive,
+            'adaptive': self.adaptive
         }
 
         self.outputs = {'Out': output}
@@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest):
     def init_exclusive(self):
         self.exclusive = True
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
@@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3):
         self.exclusive = False
 
 
+class TestAvgPoolAdaptive(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -13,33 +13,62 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool:
         ksize = [D, H, W]
         paddings = [0, 0, 0]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
         H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
         W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     mask = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
            d_start = np.max((k * strides[0] - paddings[0], 0))
            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -58,19 +87,33 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
     return out, mask
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def max_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
     N, C, H, W = x.shape
     if global_pool:
         ksize = [H, W]
         paddings = [0, 0]
+    if adaptive:
+        H_out, W_out = ksize
+    else:
         H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
         W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     mask = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
+            if adaptive:
+                r_start = adaptive_start_index(i, H, ksize[0])
+                r_end = adaptive_end_index(i, H, ksize[0])
+                c_start = adaptive_start_index(j, W, ksize[1])
+                c_end = adaptive_end_index(j, W, ksize[1])
+            else:
                 r_start = np.max((i * strides[0] - paddings[0], 0))
                 r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
                 c_start = np.max((j * strides[1] - paddings[1], 0))
@@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def setUp(self):
         self.init_test_case()
         self.init_global()
+        self.init_adaptive()
 
         input = np.random.random(self.shape).astype("float32")
         output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
-                                               self.paddings, self.global_pool)
+                                               self.paddings, self.global_pool,
+                                               self.adaptive)
         output = output.astype("float32")
         mask = mask.astype("int32")
@@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
             'paddings': self.paddings,
             'ksize': self.ksize,
             'global_pooling': self.global_pool,
+            'adaptive': self.adaptive,
         }
 
         self.inputs = {'X': input}
@@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def init_global(self):
         self.global_pool = False
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestMaxPoolWithIndex_Op):
     def init_global(self):
@@ -190,5 +239,15 @@ class TestCase7(TestCase6):
         self.global_pool = False
 
 
+class TestCastAdaptive2d(TestCase6):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
+class TestCastAdaptive3d(TestMaxPoolWithIndex_Op):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_psroi_pool_op.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

from __future__ import print_function

import math
import numpy as np
import unittest
from op_test import OpTest


class TestPSROIPoolOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
        self.calc_psroi_pool()
        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
        self.attrs = {
            'output_channels': self.output_channels,
            'spatial_scale': self.spatial_scale,
            'pooled_height': self.pooled_height,
            'pooled_width': self.pooled_width
        }
        self.outputs = {'Out': self.outs}

    def init_test_case(self):
        self.batch_size = 3
        self.channels = 3 * 2 * 2
        self.height = 6
        self.width = 4

        self.x_dim = [self.batch_size, self.channels, self.height, self.width]

        self.spatial_scale = 1.0 / 4.0
        self.output_channels = 3
        self.pooled_height = 2
        self.pooled_width = 2

        self.x = np.random.random(self.x_dim).astype('float32')

    def make_rois(self):
        rois = []
        self.rois_lod = [[]]
        for bno in range(self.batch_size):
            self.rois_lod[0].append(bno + 1)
            for i in range(bno + 1):
                x1 = np.random.random_integers(
                    0, self.width // self.spatial_scale - self.pooled_width)
                y1 = np.random.random_integers(
                    0, self.height // self.spatial_scale - self.pooled_height)
                x2 = np.random.random_integers(
                    x1 + self.pooled_width, self.width // self.spatial_scale)
                y2 = np.random.random_integers(
                    y1 + self.pooled_height, self.height // self.spatial_scale)
                roi = [bno, x1, y1, x2, y2]
                rois.append(roi)
        self.rois_num = len(rois)
        self.rois = np.array(rois).astype('float32')

    def calc_psroi_pool(self):
        output_shape = (self.rois_num, self.output_channels,
                        self.pooled_height, self.pooled_width)
        out_data = np.zeros(output_shape)
        for i in range(self.rois_num):
            roi = self.rois[i]
            roi_batch_id = int(roi[0])
            roi_start_w = round(roi[1]) * self.spatial_scale
            roi_start_h = round(roi[2]) * self.spatial_scale
            roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale
            roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale

            roi_height = max(roi_end_h - roi_start_h, 0.1)
            roi_width = max(roi_end_w - roi_start_w, 0.1)

            bin_size_h = roi_height / float(self.pooled_height)
            bin_size_w = roi_width / float(self.pooled_width)

            x_i = self.x[roi_batch_id]

            for c in range(self.output_channels):
                for ph in range(self.pooled_height):
                    for pw in range(self.pooled_width):
                        hstart = int(
                            math.floor(float(ph) * bin_size_h + roi_start_h))
                        wstart = int(
                            math.floor(float(pw) * bin_size_w + roi_start_w))
                        hend = int(
                            math.ceil(float(ph + 1) * bin_size_h + roi_start_h))
                        wend = int(
                            math.ceil(float(pw + 1) * bin_size_w + roi_start_w))
                        hstart = min(max(hstart, 0), self.height)
                        hend = min(max(hend, 0), self.height)
                        wstart = min(max(wstart, 0), self.width)
                        wend = min(max(wend, 0), self.width)

                        c_in = (c * self.pooled_height + ph
                                ) * self.pooled_width + pw
                        is_empty = (hend <= hstart) or (wend <= wstart)
                        out_sum = 0.
                        for ih in range(hstart, hend):
                            for iw in range(wstart, wend):
                                out_sum += x_i[c_in, ih, iw]
                        bin_area = (hend - hstart) * (wend - wstart)
                        out_data[i, c, ph, pw] = 0. if is_empty else (
                            out_sum / float(bin_area))
        self.outs = out_data.astype('float32')

    def setUp(self):
        self.op_type = 'psroi_pool'
        self.set_data()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')


if __name__ == '__main__':
    unittest.main()