Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a412826c
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 2 年 前同步成功
通知
2325
Star
20933
Fork
5424
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a412826c
编写于
11月 05, 2018
作者:
Z
Zhen Wang
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of
https://github.com/PaddlePaddle/Paddle
into nlp-dam
上级
1da03005
61fdc38e
变更
122
隐藏空白更改
内联
并排
Showing
122 changed file
with
4617 addition
and
902 deletion
+4617
-902
CMakeLists.txt
CMakeLists.txt
+0
-2
README.md
README.md
+11
-11
paddle/fluid/API.spec
paddle/fluid/API.spec
+6
-4
paddle/fluid/CMakeLists.txt
paddle/fluid/CMakeLists.txt
+3
-5
paddle/fluid/framework/details/CMakeLists.txt
paddle/fluid/framework/details/CMakeLists.txt
+5
-2
paddle/fluid/framework/details/all_reduce_op_handle.cc
paddle/fluid/framework/details/all_reduce_op_handle.cc
+3
-3
paddle/fluid/framework/details/broadcast_op_handle.h
paddle/fluid/framework/details/broadcast_op_handle.h
+2
-1
paddle/fluid/framework/details/broadcast_op_handle_test.cc
paddle/fluid/framework/details/broadcast_op_handle_test.cc
+1
-221
paddle/fluid/framework/details/broadcast_op_handle_test.h
paddle/fluid/framework/details/broadcast_op_handle_test.h
+271
-0
paddle/fluid/framework/details/build_strategy.cc
paddle/fluid/framework/details/build_strategy.cc
+11
-0
paddle/fluid/framework/details/build_strategy.h
paddle/fluid/framework/details/build_strategy.h
+2
-0
paddle/fluid/framework/details/computation_op_handle.cc
paddle/fluid/framework/details/computation_op_handle.cc
+1
-1
paddle/fluid/framework/details/data_balance_op_handle.cc
paddle/fluid/framework/details/data_balance_op_handle.cc
+3
-3
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
...uid/framework/details/fast_threaded_ssa_graph_executor.cc
+8
-8
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
...luid/framework/details/fast_threaded_ssa_graph_executor.h
+2
-1
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
...fluid/framework/details/fused_broadcast_op_handle_test.cc
+165
-0
paddle/fluid/framework/details/gather_op_handle.cc
paddle/fluid/framework/details/gather_op_handle.cc
+2
-2
paddle/fluid/framework/details/op_handle_base.cc
paddle/fluid/framework/details/op_handle_base.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.cc
paddle/fluid/framework/details/reduce_op_handle.cc
+1
-1
paddle/fluid/framework/details/reduce_op_handle.h
paddle/fluid/framework/details/reduce_op_handle.h
+2
-1
paddle/fluid/framework/details/rpc_op_handle.cc
paddle/fluid/framework/details/rpc_op_handle.cc
+1
-1
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+4
-4
paddle/fluid/framework/details/sequential_execution_pass.cc
paddle/fluid/framework/details/sequential_execution_pass.cc
+109
-0
paddle/fluid/framework/details/sequential_execution_pass.h
paddle/fluid/framework/details/sequential_execution_pass.h
+34
-0
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
...le/fluid/framework/details/threaded_ssa_graph_executor.cc
+8
-9
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+1
-1
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+2
-0
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
+2
-1
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
...e/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+3
-0
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+58
-0
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
+34
-0
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
...e/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
+123
-0
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+3
-0
paddle/fluid/framework/ir/graph.cc
paddle/fluid/framework/ir/graph.cc
+59
-0
paddle/fluid/framework/ir/graph_pattern_detector.cc
paddle/fluid/framework/ir/graph_pattern_detector.cc
+17
-4
paddle/fluid/framework/lod_tensor.cc
paddle/fluid/framework/lod_tensor.cc
+1
-1
paddle/fluid/framework/lod_tensor_array.h
paddle/fluid/framework/lod_tensor_array.h
+0
-74
paddle/fluid/framework/operator.cc
paddle/fluid/framework/operator.cc
+16
-15
paddle/fluid/framework/operator.h
paddle/fluid/framework/operator.h
+1
-1
paddle/fluid/framework/parallel_executor.cc
paddle/fluid/framework/parallel_executor.cc
+2
-4
paddle/fluid/framework/tensor_test.cc
paddle/fluid/framework/tensor_test.cc
+13
-0
paddle/fluid/framework/tensor_util.cc
paddle/fluid/framework/tensor_util.cc
+6
-0
paddle/fluid/inference/CMakeLists.txt
paddle/fluid/inference/CMakeLists.txt
+3
-0
paddle/fluid/inference/analysis/CMakeLists.txt
paddle/fluid/inference/analysis/CMakeLists.txt
+11
-16
paddle/fluid/inference/analysis/analyzer.h
paddle/fluid/inference/analysis/analyzer.h
+1
-0
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+3
-0
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+10
-32
paddle/fluid/inference/api/api_impl_tester.cc
paddle/fluid/inference/api/api_impl_tester.cc
+8
-6
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
...luid/inference/api/api_tensorrt_subgraph_engine_tester.cc
+2
-2
paddle/fluid/inference/api/demo_ci/run.sh
paddle/fluid/inference/api/demo_ci/run.sh
+1
-1
paddle/fluid/inference/test.cmake
paddle/fluid/inference/test.cmake
+31
-0
paddle/fluid/inference/tests/api/CMakeLists.txt
paddle/fluid/inference/tests/api/CMakeLists.txt
+0
-14
paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+112
-0
paddle/fluid/operators/affine_grid_op.cc
paddle/fluid/operators/affine_grid_op.cc
+233
-0
paddle/fluid/operators/affine_grid_op.h
paddle/fluid/operators/affine_grid_op.h
+190
-0
paddle/fluid/operators/delete_var_op.cc
paddle/fluid/operators/delete_var_op.cc
+7
-1
paddle/fluid/operators/gather_op.cc
paddle/fluid/operators/gather_op.cc
+4
-2
paddle/fluid/operators/gather_op.cu
paddle/fluid/operators/gather_op.cu
+8
-2
paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
+132
-0
paddle/fluid/operators/grid_sampler_op.cc
paddle/fluid/operators/grid_sampler_op.cc
+203
-0
paddle/fluid/operators/grid_sampler_op.h
paddle/fluid/operators/grid_sampler_op.h
+322
-0
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+2
-2
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+53
-0
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+63
-0
paddle/fluid/operators/math/jit_gen.cc
paddle/fluid/operators/math/jit_gen.cc
+90
-0
paddle/fluid/operators/math/jit_gen.h
paddle/fluid/operators/math/jit_gen.h
+80
-0
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+2
-1
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+67
-54
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+1
-1
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+3
-3
paddle/fluid/operators/math/jit_kernel_macro.h
paddle/fluid/operators/math/jit_kernel_macro.h
+93
-32
paddle/fluid/operators/math/jit_kernel_rnn.cc
paddle/fluid/operators/math/jit_kernel_rnn.cc
+20
-20
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+7
-7
paddle/fluid/operators/math/pooling.cc
paddle/fluid/operators/math/pooling.cc
+14
-8
paddle/fluid/operators/math/pooling.cu
paddle/fluid/operators/math/pooling.cu
+30
-25
paddle/fluid/operators/math/pooling.h
paddle/fluid/operators/math/pooling.h
+4
-4
paddle/fluid/operators/math/sequence_pooling.cc
paddle/fluid/operators/math/sequence_pooling.cc
+44
-4
paddle/fluid/operators/math/sequence_pooling.cu
paddle/fluid/operators/math/sequence_pooling.cu
+1
-1
paddle/fluid/operators/math/sequence_pooling.h
paddle/fluid/operators/math/sequence_pooling.h
+1
-1
paddle/fluid/operators/pool_cudnn_op.cu.cc
paddle/fluid/operators/pool_cudnn_op.cu.cc
+6
-2
paddle/fluid/operators/pool_op.cc
paddle/fluid/operators/pool_op.cc
+29
-0
paddle/fluid/operators/pool_op.h
paddle/fluid/operators/pool_op.h
+8
-6
paddle/fluid/operators/sequence_pool_op.cc
paddle/fluid/operators/sequence_pool_op.cc
+1
-0
paddle/fluid/operators/sequence_pool_op.h
paddle/fluid/operators/sequence_pool_op.h
+11
-6
paddle/fluid/operators/sign_op.cc
paddle/fluid/operators/sign_op.cc
+2
-1
paddle/fluid/operators/sign_op.cu
paddle/fluid/operators/sign_op.cu
+5
-1
paddle/fluid/operators/softmax_with_cross_entropy_op.cc
paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+6
-0
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+166
-21
paddle/fluid/operators/spp_op.h
paddle/fluid/operators/spp_op.h
+5
-3
paddle/fluid/operators/sum_op.cc
paddle/fluid/operators/sum_op.cc
+5
-3
paddle/fluid/platform/cudnn_helper.h
paddle/fluid/platform/cudnn_helper.h
+30
-3
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+17
-19
paddle/fluid/platform/device_context.h
paddle/fluid/platform/device_context.h
+3
-4
paddle/fluid/platform/dynload/cudnn.h
paddle/fluid/platform/dynload/cudnn.h
+45
-38
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+7
-0
paddle/scripts/paddle_build.sh
paddle/scripts/paddle_build.sh
+3
-7
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+6
-2
python/paddle/fluid/layers/control_flow.py
python/paddle/fluid/layers/control_flow.py
+18
-2
python/paddle/fluid/layers/nn.py
python/paddle/fluid/layers/nn.py
+346
-39
python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
...s/book/high-level-api/image_classification/CMakeLists.txt
+16
-4
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+8
-1
python/paddle/fluid/tests/unittests/dist_mnist.py
python/paddle/fluid/tests/unittests/dist_mnist.py
+4
-2
python/paddle/fluid/tests/unittests/dist_save_load.py
python/paddle/fluid/tests/unittests/dist_save_load.py
+174
-0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...ddle/fluid/tests/unittests/parallel_executor_test_base.py
+3
-1
python/paddle/fluid/tests/unittests/test_affine_grid_op.py
python/paddle/fluid/tests/unittests/test_affine_grid_op.py
+79
-0
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+26
-41
python/paddle/fluid/tests/unittests/test_dist_save_load.py
python/paddle/fluid/tests/unittests/test_dist_save_load.py
+90
-0
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+3
-2
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+118
-46
python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+123
-0
python/paddle/fluid/tests/unittests/test_layers.py
python/paddle/fluid/tests/unittests/test_layers.py
+25
-0
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
...n/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+61
-4
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
...fluid/tests/unittests/test_parallel_executor_seresnext.py
+40
-0
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
...uid/tests/unittests/test_parallel_executor_transformer.py
+2
-0
python/paddle/fluid/tests/unittests/test_pool2d_op.py
python/paddle/fluid/tests/unittests/test_pool2d_op.py
+27
-8
python/paddle/fluid/tests/unittests/test_pool3d_op.py
python/paddle/fluid/tests/unittests/test_pool3d_op.py
+28
-8
python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
...paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
+130
-0
python/paddle/fluid/tests/unittests/test_seq_pool.py
python/paddle/fluid/tests/unittests/test_seq_pool.py
+14
-0
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
...uid/tests/unittests/test_softmax_with_cross_entropy_op.py
+23
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+22
-5
python/paddle/fluid/transpiler/inference_transpiler.py
python/paddle/fluid/transpiler/inference_transpiler.py
+28
-0
python/setup.py.in
python/setup.py.in
+1
-1
未找到文件。
CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -62,13 +62,11 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option
(
USE_EIGEN_FOR_BLAS
"Use matrix multiplication in Eigen"
OFF
)
option
(
EIGEN_USE_THREADS
"Compile with multi-threaded Eigen"
OFF
)
option
(
WITH_ARM_FP16
"Use half precision support on armv8.2-a cpu"
OFF
)
option
(
WITH_FAST_BUNDLE_TEST
"Bundle tests that can be run in a single process together to reduce launch overhead"
OFF
)
option
(
WITH_CONTRIB
"Compile the third-party contributation"
OFF
)
option
(
REPLACE_ENFORCE_GLOG
"Replace PADDLE_ENFORCE with glog/CHECK for better debug."
OFF
)
option
(
WITH_ANAKIN
"Compile with Anakin library"
OFF
)
option
(
WITH_GRPC
"Use grpc as the default rpc framework"
${
WITH_DISTRIBUTE
}
)
option
(
WITH_BRPC_RDMA
"Use brpc rdma as the rpc protocal"
OFF
)
option
(
WITH_INFERENCE
"Compile fluid inference library"
ON
)
option
(
ON_INFER
"Turn on inference optimization."
OFF
)
option
(
WITH_INFERENCE_API_TEST
"Test fluid inference high-level api interface"
OFF
)
option
(
WITH_SYSTEM_BLAS
"Use system blas library"
OFF
)
...
...
README.md
浏览文件 @
a412826c
...
...
@@ -2,8 +2,8 @@
[

](https://travis-ci.org/PaddlePaddle/Paddle)
[

](http://paddlepaddle.org/documentation/docs/en/1.
0
/getstarted/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html)
[

](http://paddlepaddle.org/documentation/docs/en/1.
1
/getstarted/index_en.html)
[

](http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html)
[

](https://github.com/PaddlePaddle/Paddle/releases)
[

](LICENSE)
...
...
@@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our
[
release announcement
](
https://github.com/PaddlePaddle/Paddle/releases
)
to track the latest feature of PaddlePaddle.
### Latest PaddlePaddle Release: [Fluid 1.
0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0
)
### Latest PaddlePaddle Release: [Fluid 1.
1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1
)
### Install Latest Stable Release:
```
# Linux CPU
...
...
@@ -27,9 +27,9 @@ pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==1.
0.1
.post87
pip install paddlepaddle-gpu==1.
1.0
.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==1.
0.1
.post85
pip install paddlepaddle-gpu==1.
1.0
.post85
# For installation on other platform, refer to http://paddlepaddle.org/
```
...
...
@@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85
## Installation
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html
)
on our website.
It is recommended to read
[
this doc
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html
)
on our website.
## Documentation
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
0.0
/getstarted/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/beginners_guide/index.html
)
documentation.
We provide
[
English
](
http://paddlepaddle.org/documentation/docs/en/1.
1
/getstarted/index_en.html
)
and
[
Chinese
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/beginners_guide/index.html
)
documentation.
-
[
Deep Learning 101
](
https://github.com/PaddlePaddle/book
)
You might want to start from this online interactive book that can run in a Jupyter Notebook.
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/user_guides/howto/training/cluster_howto.html
)
-
[
Distributed Training
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/user_guides/howto/training/cluster_howto.html
)
You can run distributed training jobs on MPI clusters.
-
[
Python API
](
http://paddlepaddle.org/documentation/api/zh/1.
0
/fluid.html
)
-
[
Python API
](
http://paddlepaddle.org/documentation/api/zh/1.
1
/fluid.html
)
Our new API enables much shorter programs.
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/zh/1.
0
/advanced_usage/development/contribute_to_paddle.html
)
-
[
How to Contribute
](
http://paddlepaddle.org/documentation/docs/zh/1.
1
/advanced_usage/development/contribute_to_paddle.html
)
We appreciate your contributions!
...
...
paddle/fluid/API.spec
浏览文件 @
a412826c
...
...
@@ -64,11 +64,11 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', '
paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None))
paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None))
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'
, 'is_test'], varargs=None, keywords=None, defaults=(False,)
)
paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'
], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, Non
e))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'
], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, Non
e))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'
, 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, Tru
e))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'
, 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, Tru
e))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
...
@@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'
], varargs=None, keywords=None, defaults=(False, -100
))
paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'
, 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False
))
paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
...
...
@@ -174,9 +174,11 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
...
...
paddle/fluid/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -9,8 +9,6 @@ add_subdirectory(pybind)
add_subdirectory
(
recordio
)
endif
(
NOT WIN32
)
if
(
WITH_INFERENCE
)
# NOTE: please add subdirectory inference at last.
add_subdirectory
(
inference
)
add_subdirectory
(
train
)
endif
()
# NOTE: please add subdirectory inference at last.
add_subdirectory
(
inference
)
add_subdirectory
(
train
)
paddle/fluid/framework/details/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -35,13 +35,15 @@ if(WITH_GPU)
all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass
)
endif
()
cc_library
(
sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass
)
cc_library
(
multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle
)
if
(
WITH_GPU
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass
sequential_execution_pass
)
else
()
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto
)
cc_library
(
ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto
sequential_execution_pass
)
endif
()
cc_library
(
threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
...
...
@@ -56,6 +58,7 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
# device_context reduce_op_handle )
cc_library
(
fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context
)
cc_test
(
fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle
)
cc_library
(
build_strategy SRCS build_strategy.cc DEPS
graph_viz_pass multi_devices_graph_pass
...
...
paddle/fluid/framework/details/all_reduce_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
nccl_ctxs_
(
ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctxes_
[
p
]
=
nccl_ctxs_
->
DevCtx
(
p
);
this
->
SetDeviceContext
(
p
,
nccl_ctxs_
->
DevCtx
(
p
)
);
}
}
}
...
...
@@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void
AllReduceOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
begin
()
->
second
);
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
c
begin
()
->
second
);
if
(
NoDummyInputSize
()
==
1
)
{
return
;
// No need to all reduce when GPU count = 1;
...
...
@@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() {
*
local_scopes_
[
i
]
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
auto
&
p
=
places_
[
i
];
auto
*
var
=
scope
.
FindVar
(
out_var_handles
[
i
]
->
name_
);
auto
*
dev_ctx
=
dev_ctxes_
[
p
]
;
auto
*
dev_ctx
=
dev_ctxes_
.
at
(
p
)
;
RunAndRecordEvent
(
p
,
[
&
trg
,
var
,
dev_ctx
,
p
]
{
auto
&
tensor_gpu
=
*
var
->
GetMutable
<
framework
::
LoDTensor
>
();
...
...
paddle/fluid/framework/details/broadcast_op_handle.h
浏览文件 @
a412826c
...
...
@@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase {
nccl_ctxs_
(
nccl_ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p_ctx
:
nccl_ctxs_
->
contexts_
)
{
dev_ctxes_
[
platform
::
CUDAPlace
(
p_ctx
.
first
)]
=
p_ctx
.
second
.
ctx_
.
get
();
this
->
SetDeviceContext
(
platform
::
CUDAPlace
(
p_ctx
.
first
),
p_ctx
.
second
.
ctx_
.
get
());
}
}
}
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.cc
浏览文件 @
a412826c
...
...
@@ -12,232 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
// test data amount
const
f
::
DDim
kDims
=
{
20
,
20
};
struct
TestBroadcastOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
std
::
vector
<
p
::
Place
>
gpu_list_
;
bool
use_gpu_
;
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
void
WaitAll
()
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#ifdef PADDLE_WITH_CUDA
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
#endif
}
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<<
count
;
exit
(
0
);
}
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CUDAPlace
(
i
);
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CUDADeviceContext
(
p
));
}
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
gpu_list_
));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
int
count
=
8
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CPUPlace
();
gpu_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
}
void
InitBroadcastOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"out"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
param_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
std
::
unique_ptr
<
ir
::
Node
>
n
=
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
,
nccl_ctxs_
.
get
()));
#else
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
gpu_list_
));
#endif
}
std
::
unique_ptr
<
ir
::
Node
>
v
=
ir
::
CreateNodeForTest
(
"node1"
,
ir
::
Node
::
Type
::
kVariable
);
auto
*
in_var_handle
=
new
VarHandle
(
v
.
get
(),
1
,
input_scope_idx
,
"input"
,
gpu_list_
[
input_scope_idx
]);
vars_
.
emplace_back
(
in_var_handle
);
op_handle_
->
AddInput
(
in_var_handle
);
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v2
=
ir
::
CreateNodeForTest
(
"node2"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v2
.
get
()));
DummyVarHandle
*
dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddInput
(
dummy_var_handle
);
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
if
(
!
use_gpu_
)
{
op_handle_
->
SetDeviceContext
(
gpu_list_
[
j
],
ctxs_
[
j
].
get
());
}
std
::
unique_ptr
<
ir
::
Node
>
v3
=
ir
::
CreateNodeForTest
(
"node3"
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
out_var_handle
=
new
VarHandle
(
v3
.
get
(),
2
,
j
,
"out"
,
gpu_list_
[
j
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
}
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v4
=
ir
::
CreateNodeForTest
(
"node4"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v4
.
get
()));
DummyVarHandle
*
out_dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
out_dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddOutput
(
out_dummy_var_handle
);
}
void
TestBroadcastLodTensor
(
size_t
input_scope_idx
)
{
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_lod_tensor
=
in_var
->
GetMutable
<
f
::
LoDTensor
>
();
in_lod_tensor
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
;
}
f
::
LoD
lod
{{
0
,
10
,
20
}};
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
in_lod_tensor
);
in_lod_tensor
->
set_lod
(
lod
);
in_lod_tensor
->
Resize
(
kDims
);
op_handle_
->
Run
(
false
);
WaitAll
();
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
out_tensor
=
out_var
->
Get
<
f
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
out_tensor
.
lod
(),
lod
,
"lod is not equal."
);
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
out_tensor
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
mutable_data
<
float
>
(
cpu_place
);
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
}
void
TestBroadcastSelectedRows
(
size_t
input_scope_idx
)
{
auto
in_var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
"input"
);
PADDLE_ENFORCE_NOT_NULL
(
in_var
);
auto
in_selected_rows
=
in_var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
in_selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
gpu_list_
[
input_scope_idx
]);
int
height
=
static_cast
<
int
>
(
kDims
[
0
])
*
2
;
std
::
vector
<
int64_t
>
rows
{
0
,
1
,
2
,
3
,
3
,
0
,
14
,
7
,
3
,
1
,
2
,
4
,
6
,
3
,
1
,
1
,
1
,
1
,
3
,
7
};
in_selected_rows
->
set_height
(
height
);
in_selected_rows
->
set_rows
(
rows
);
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
;
}
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
value
);
op_handle_
->
Run
(
false
);
WaitAll
();
p
::
CPUPlace
cpu_place
;
for
(
size_t
j
=
0
;
j
<
gpu_list_
.
size
();
++
j
)
{
auto
out_var
=
param_scopes_
[
j
]
->
FindVar
(
"out"
);
PADDLE_ENFORCE_NOT_NULL
(
out_var
);
auto
&
out_select_rows
=
out_var
->
Get
<
f
::
SelectedRows
>
();
auto
rt
=
out_select_rows
.
value
();
PADDLE_ENFORCE_EQ
(
out_select_rows
.
height
(),
height
,
"height is not equal."
);
for
(
size_t
k
=
0
;
k
<
out_select_rows
.
rows
().
size
();
++
k
)
{
PADDLE_ENFORCE_EQ
(
out_select_rows
.
rows
()[
k
],
rows
[
k
]);
}
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
}
};
TEST
(
BroadcastTester
,
TestCPUBroadcastTestLodTensor
)
{
TestBroadcastOpHandle
test_op
;
size_t
input_scope_idx
=
0
;
...
...
paddle/fluid/framework/details/broadcast_op_handle_test.h
0 → 100644
浏览文件 @
a412826c
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/platform/device_context.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// Short alias for paddle::framework, used throughout the test helpers below.
namespace
f
=
paddle
::
framework
;
// Short alias for paddle::platform (places and device contexts).
namespace
p
=
paddle
::
platform
;
// Shape of every tensor the broadcast tests create and compare; the helpers
// size their reference vectors with product(kDims).
const
f
::
DDim
kDims
=
{
20
,
20
};
struct
TestBroadcastOpHandle
{
std
::
vector
<
std
::
unique_ptr
<
p
::
DeviceContext
>>
ctxs_
;
std
::
vector
<
Scope
*>
local_scopes_
;
std
::
vector
<
Scope
*>
param_scopes_
;
Scope
g_scope_
;
std
::
unique_ptr
<
OpHandleBase
>
op_handle_
;
std
::
vector
<
std
::
unique_ptr
<
VarHandleBase
>>
vars_
;
std
::
vector
<
p
::
Place
>
place_list_
;
bool
use_gpu_
;
#ifdef PADDLE_WITH_CUDA
std
::
unique_ptr
<
platform
::
NCCLContextMap
>
nccl_ctxs_
;
#endif
void
WaitAll
()
{
for
(
size_t
j
=
0
;
j
<
ctxs_
.
size
();
++
j
)
{
ctxs_
[
j
]
->
Wait
();
}
#ifdef PADDLE_WITH_CUDA
if
(
nccl_ctxs_
)
{
nccl_ctxs_
->
WaitAll
();
}
#endif
}
void
InitCtxOnGpu
(
bool
use_gpu
)
{
use_gpu_
=
use_gpu
;
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
int
count
=
p
::
GetCUDADeviceCount
();
if
(
count
<=
1
)
{
LOG
(
WARNING
)
<<
"Cannot test multi-gpu Broadcast, because the CUDA "
"device count is "
<<
count
;
exit
(
0
);
}
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CUDAPlace
(
i
);
place_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CUDADeviceContext
(
p
));
}
nccl_ctxs_
.
reset
(
new
platform
::
NCCLContextMap
(
place_list_
));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
int
count
=
8
;
for
(
int
i
=
0
;
i
<
count
;
++
i
)
{
auto
p
=
p
::
CPUPlace
();
place_list_
.
push_back
(
p
);
ctxs_
.
emplace_back
(
new
p
::
CPUDeviceContext
(
p
));
}
#ifdef PADDLE_WITH_CUDA
nccl_ctxs_
.
reset
(
nullptr
);
#endif
}
}
void
InitBroadcastOp
(
size_t
input_scope_idx
)
{
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
local_scopes_
.
push_back
(
&
(
g_scope_
.
NewScope
()));
Scope
&
local_scope
=
local_scopes_
.
back
()
->
NewScope
();
*
local_scopes_
.
back
()
->
Var
(
details
::
kLocalExecScopeName
)
->
GetMutable
<
Scope
*>
()
=
&
local_scope
;
local_scope
.
Var
(
"out"
);
param_scopes_
.
emplace_back
(
&
local_scope
);
}
param_scopes_
[
input_scope_idx
]
->
Var
(
"input"
);
std
::
unique_ptr
<
ir
::
Node
>
n
=
ir
::
CreateNodeForTest
(
"node0"
,
ir
::
Node
::
Type
::
kOperation
);
if
(
use_gpu_
)
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
PADDLE_THROW
(
"CUDA is not support."
);
#endif
}
else
{
#ifdef PADDLE_WITH_CUDA
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
,
nccl_ctxs_
.
get
()));
#else
op_handle_
.
reset
(
new
BroadcastOpHandle
(
n
.
get
(),
local_scopes_
,
place_list_
));
#endif
}
std
::
unique_ptr
<
ir
::
Node
>
v
=
ir
::
CreateNodeForTest
(
"node1"
,
ir
::
Node
::
Type
::
kVariable
);
auto
*
in_var_handle
=
new
VarHandle
(
v
.
get
(),
1
,
input_scope_idx
,
"input"
,
place_list_
[
input_scope_idx
]);
vars_
.
emplace_back
(
in_var_handle
);
op_handle_
->
AddInput
(
in_var_handle
);
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v2
=
ir
::
CreateNodeForTest
(
"node2"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v2
.
get
()));
DummyVarHandle
*
dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddInput
(
dummy_var_handle
);
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
if
(
!
use_gpu_
)
{
op_handle_
->
SetDeviceContext
(
place_list_
[
j
],
ctxs_
[
j
].
get
());
}
std
::
unique_ptr
<
ir
::
Node
>
v3
=
ir
::
CreateNodeForTest
(
"node3"
,
ir
::
Node
::
Type
::
kVariable
);
VarHandle
*
out_var_handle
=
new
VarHandle
(
v3
.
get
(),
2
,
j
,
"out"
,
place_list_
[
j
]);
vars_
.
emplace_back
(
out_var_handle
);
op_handle_
->
AddOutput
(
out_var_handle
);
}
// add dummy var
std
::
unique_ptr
<
ir
::
Node
>
v4
=
ir
::
CreateNodeForTest
(
"node4"
,
ir
::
Node
::
Type
::
kVariable
);
vars_
.
emplace_back
(
new
DummyVarHandle
(
v4
.
get
()));
DummyVarHandle
*
out_dummy_var_handle
=
static_cast
<
DummyVarHandle
*>
(
vars_
.
back
().
get
());
out_dummy_var_handle
->
ClearGeneratedOp
();
op_handle_
->
AddOutput
(
out_dummy_var_handle
);
}
std
::
vector
<
float
>
InitLoDTensor
(
const
std
::
string
&
varname
,
size_t
input_scope_idx
,
const
f
::
LoD
&
lod
,
float
val_scalar
=
0.0
)
{
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
lod_tensor
=
var
->
GetMutable
<
f
::
LoDTensor
>
();
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
+
val_scalar
;
}
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
lod_tensor
);
lod_tensor
->
set_lod
(
lod
);
lod_tensor
->
Resize
(
kDims
);
return
send_vector
;
}
std
::
vector
<
float
>
InitSelectedRows
(
const
std
::
string
&
varname
,
size_t
input_scope_idx
,
const
std
::
vector
<
int64_t
>&
rows
,
int
height
,
float
value_scalar
=
0.0
)
{
std
::
vector
<
float
>
send_vector
(
static_cast
<
size_t
>
(
f
::
product
(
kDims
)));
for
(
size_t
k
=
0
;
k
<
send_vector
.
size
();
++
k
)
{
send_vector
[
k
]
=
k
+
value_scalar
;
}
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
selected_rows
=
var
->
GetMutable
<
f
::
SelectedRows
>
();
auto
value
=
selected_rows
->
mutable_value
();
value
->
mutable_data
<
float
>
(
kDims
,
place_list_
[
input_scope_idx
]);
selected_rows
->
set_height
(
height
);
selected_rows
->
set_rows
(
rows
);
paddle
::
framework
::
TensorFromVector
<
float
>
(
send_vector
,
*
(
ctxs_
[
input_scope_idx
]),
value
);
return
send_vector
;
}
void
SelectedRowsEqual
(
const
std
::
string
&
varname
,
int
input_scope_idx
,
const
std
::
vector
<
float
>&
send_vector
,
const
std
::
vector
<
int64_t
>&
rows
,
int
height
)
{
auto
var
=
param_scopes_
[
input_scope_idx
]
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
&
selected_rows
=
var
->
Get
<
f
::
SelectedRows
>
();
auto
rt
=
selected_rows
.
value
();
PADDLE_ENFORCE_EQ
(
selected_rows
.
height
(),
height
,
"height is not equal."
);
for
(
size_t
k
=
0
;
k
<
selected_rows
.
rows
().
size
();
++
k
)
{
PADDLE_ENFORCE_EQ
(
selected_rows
.
rows
()[
k
],
rows
[
k
]);
}
p
::
CPUPlace
cpu_place
;
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
rt
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
f
::
product
(
kDims
);
++
i
)
{
ASSERT_NEAR
(
ct
[
i
],
send_vector
[
i
],
1e-5
);
}
}
void
LoDTensorEqual
(
const
std
::
string
&
varname
,
const
std
::
vector
<
float
>&
send_vec
,
const
f
::
LoD
&
lod
,
framework
::
Scope
*
scope
)
{
p
::
CPUPlace
cpu_place
;
auto
var
=
scope
->
FindVar
(
varname
);
PADDLE_ENFORCE_NOT_NULL
(
var
);
auto
tensor
=
var
->
Get
<
f
::
LoDTensor
>
();
PADDLE_ENFORCE_EQ
(
tensor
.
lod
(),
lod
,
"lod is not equal."
);
f
::
Tensor
result_tensor
;
f
::
TensorCopySync
(
tensor
,
cpu_place
,
&
result_tensor
);
float
*
ct
=
result_tensor
.
mutable_data
<
float
>
(
cpu_place
);
for
(
int64_t
k
=
0
;
k
<
f
::
product
(
kDims
);
++
k
)
{
ASSERT_NEAR
(
ct
[
k
],
send_vec
[
k
],
1e-5
);
}
}
void
TestBroadcastLodTensor
(
size_t
input_scope_idx
)
{
f
::
LoD
lod
{{
0
,
10
,
20
}};
auto
send_vector
=
InitLoDTensor
(
"input"
,
input_scope_idx
,
lod
);
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
LoDTensorEqual
(
"out"
,
send_vector
,
lod
,
param_scopes_
[
j
]);
}
}
void
TestBroadcastSelectedRows
(
size_t
input_scope_idx
)
{
std
::
vector
<
int64_t
>
rows
{
0
,
1
,
2
,
3
,
3
,
0
,
14
,
7
,
3
,
1
,
2
,
4
,
6
,
3
,
1
,
1
,
1
,
1
,
3
,
7
};
int
height
=
static_cast
<
int
>
(
kDims
[
0
]
*
2
);
auto
send_vector
=
InitSelectedRows
(
"input"
,
input_scope_idx
,
rows
,
height
);
op_handle_
->
Run
(
false
);
WaitAll
();
for
(
size_t
j
=
0
;
j
<
place_list_
.
size
();
++
j
)
{
SelectedRowsEqual
(
"out"
,
input_scope_idx
,
send_vector
,
rows
,
height
);
}
}
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/build_strategy.cc
浏览文件 @
a412826c
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
...
...
@@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
public:
explicit
ParallelExecutorPassBuilder
(
const
BuildStrategy
&
strategy
)
:
ir
::
PassBuilder
(),
strategy_
(
strategy
)
{
if
(
strategy_
.
enable_sequential_execution_
)
{
AppendPass
(
"sequential_execution_pass"
);
}
// Add a graph viz pass to record a graph.
if
(
!
strategy_
.
debug_graphviz_path_
.
empty
())
{
auto
viz_pass
=
AppendPass
(
"graph_viz_pass"
);
...
...
@@ -110,6 +115,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
pass
->
Erase
(
"nccl_ctxs"
);
pass
->
SetNotOwned
<
platform
::
NCCLContextMap
>
(
"nccl_ctxs"
,
nctx
);
#endif
}
else
if
(
pass
->
Type
()
==
"sequential_execution_pass"
)
{
pass
->
Erase
(
kAllOpDescs
);
pass
->
Set
<
const
std
::
vector
<
OpDesc
*>>
(
kAllOpDescs
,
new
std
::
vector
<
OpDesc
*>
(
main_program
.
Block
(
0
).
AllOps
()));
}
graph
=
pass
->
Apply
(
std
::
move
(
graph
));
}
...
...
@@ -125,3 +135,4 @@ USE_PASS(multi_batch_merge_pass);
USE_PASS
(
multi_devices_pass
);
USE_PASS
(
multi_devices_check_pass
);
USE_PASS
(
multi_devices_print_pass
);
USE_PASS
(
sequential_execution_pass
);
paddle/fluid/framework/details/build_strategy.h
浏览文件 @
a412826c
...
...
@@ -69,6 +69,8 @@ struct BuildStrategy {
bool
enable_data_balance_
{
false
};
bool
enable_sequential_execution_
{
false
};
bool
fuse_broadcast_op_
{
false
};
// User normally doesn't need to call this API.
...
...
paddle/fluid/framework/details/computation_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() {
bool
ComputationOpHandle
::
NeedWait
(
VarHandleBase
*
in_var
)
{
bool
need_wait
=
in_var
&&
in_var
->
GeneratedOp
()
&&
in_var
->
GeneratedOp
()
->
DeviceContext
(
place_
)
!=
dev_ctxes_
[
place_
]
;
in_var
->
GeneratedOp
()
->
DeviceContext
(
place_
)
!=
dev_ctxes_
.
at
(
place_
)
;
return
need_wait
;
}
...
...
paddle/fluid/framework/details/data_balance_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle(
:
OpHandleBase
(
node
),
local_scopes_
(
local_scopes
),
places_
(
places
)
{
if
(
ctxs
)
{
for
(
auto
&
p
:
places_
)
{
this
->
dev_ctxes_
[
p
]
=
ctxs
->
DevCtx
(
p
);
this
->
SetDeviceContext
(
p
,
ctxs
->
DevCtx
(
p
)
);
}
}
}
...
...
@@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT
(
places_
.
size
(),
1
,
"Data balance can only be enabled when the number of "
"places to run larger than 1."
);
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
inputs_
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
auto
in_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Inputs
()
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Outputs
()
);
PADDLE_ENFORCE
(
in_var_handles
.
size
()
%
places_
.
size
()
==
0
);
PADDLE_ENFORCE_EQ
(
in_var_handles
.
size
(),
out_var_handles
.
size
(),
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
浏览文件 @
a412826c
...
...
@@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
size_t
num_complete
=
0
;
remaining_
=
0
;
BlockingQueue
<
size_t
>
complete_q
;
auto
complete_q
=
std
::
make_shared
<
BlockingQueue
<
size_t
>>
()
;
for
(
auto
op
:
bootstrap_ops_
)
{
RunOpAsync
(
op_deps
.
get
(),
op
,
&
complete_q
);
RunOpAsync
(
op_deps
.
get
(),
op
,
complete_q
);
}
while
(
num_complete
!=
op_deps
->
size
())
{
size_t
num_comp
=
complete_q
.
Pop
();
size_t
num_comp
=
complete_q
->
Pop
();
if
(
num_comp
==
-
1UL
)
{
int
remaining
=
0
;
while
(
true
)
{
...
...
@@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
break
;
}
for
(
int
i
=
0
;
i
<
remaining
;
++
i
)
{
complete_q
.
Pop
();
complete_q
->
Pop
();
}
}
exception_
.
ReThrow
();
...
...
@@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
}
void
FastThreadedSSAGraphExecutor
::
RunOpAsync
(
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
,
OpHandleBase
*
op
,
BlockingQueue
<
size_t
>
*
complete_q
)
{
OpHandleBase
*
op
,
const
std
::
shared_ptr
<
BlockingQueue
<
size_t
>>
&
complete_q
)
{
++
remaining_
;
this
->
pool_
.
enqueue
([
=
]
{
OpHandleBase
*
op_to_run
=
op
;
...
...
@@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
if
(
op_to_run
==
nullptr
)
{
op_to_run
=
pending_op
;
}
else
{
this
->
RunOpAsync
(
op_deps
,
pending_op
,
complete_q
);
RunOpAsync
(
op_deps
,
pending_op
,
complete_q
);
}
}
}
...
...
@@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
}
void
FastThreadedSSAGraphExecutor
::
PrepareAtomicOpDeps
()
{
atomic_op_deps_
=
pool_
.
enqueue
([
&
]
{
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
=
new
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
;
auto
*
op_deps
=
new
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
;
for
(
auto
&
pair
:
op_deps_
)
{
(
*
op_deps
)[
pair
.
first
]
=
pair
.
second
;
}
...
...
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
浏览文件 @
a412826c
...
...
@@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
std
::
atomic
<
int
>
remaining_
;
void
RunOpAsync
(
std
::
unordered_map
<
OpHandleBase
*
,
std
::
atomic
<
int
>>
*
op_deps
,
OpHandleBase
*
op
,
BlockingQueue
<
size_t
>
*
complete_q
);
OpHandleBase
*
op
,
const
std
::
shared_ptr
<
BlockingQueue
<
size_t
>>
&
complete_q
);
void
PrepareAtomicOpDeps
();
...
...
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
0 → 100644
浏览文件 @
a412826c
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// Test fixture for FusedBroadcastOpHandle: several independent broadcasts
// (one per entry of `input_scope_idxes`) are executed by a single op handle.
//
// Broadcast k reads variable InVarName(k) from scope input_scope_idxes[k] and
// writes variable OutVarName(k) in every scope.
struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
  std::vector<std::string> out_varnames_;

  // Name helpers. The index is appended with std::to_string: writing
  // `"in_var" + k` would be pointer arithmetic on the string literal and
  // yield "in_var", "n_var", "_var", ... instead of distinct indexed names.
  static std::string InVarName(size_t k) {
    return "in_var" + std::to_string(k);
  }
  static std::string OutVarName(size_t k) {
    return "out_var" + std::to_string(k);
  }

  // Builds the scopes, variables and the fused op handle.
  //
  // input_scope_idxes[k] is the index of the scope/place that owns the source
  // variable of broadcast k.
  void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) {
    // initialize scope and var
    for (size_t i = 0; i < place_list_.size(); ++i) {
      local_scopes_.push_back(&(g_scope_.NewScope()));
      Scope& local_scope = local_scopes_.back()->NewScope();
      *local_scopes_.back()
           ->Var(details::kLocalExecScopeName)
           ->GetMutable<Scope*>() = &local_scope;
      for (size_t j = 0; j < input_scope_idxes.size(); ++j) {
        local_scope.Var(OutVarName(j));
        // The source variable lives in the scope named by
        // input_scope_idxes[j], matching the VarHandle created below (the
        // former `i == j` test was only right when input_scope_idxes[j] == j).
        if (i == input_scope_idxes[j]) local_scope.Var(InVarName(j));
      }
      param_scopes_.emplace_back(&local_scope);
    }

    // create op handle node
    std::unique_ptr<ir::Node> n = ir::CreateNodeForTest(
        "fused_broadcast", ir::Node::Type::kOperation);
    if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new FusedBroadcastOpHandle(
          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
      PADDLE_THROW("CUDA is not supported.");
#endif
    } else {
#ifdef PADDLE_WITH_CUDA
      op_handle_.reset(new FusedBroadcastOpHandle(
          n.get(), local_scopes_, place_list_, nccl_ctxs_.get()));
#else
      op_handle_.reset(
          new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_));
#endif
    }

    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      // add input var handle
      std::unique_ptr<ir::Node> in_node = ir::CreateNodeForTest(
          "in_node" + std::to_string(i), ir::Node::Type::kVariable);
      VarHandle* in_var_handle =
          new VarHandle(in_node.get(), 1, input_scope_idxes[i], InVarName(i),
                        place_list_[input_scope_idxes[i]]);
      vars_.emplace_back(in_var_handle);
      op_handle_->AddInput(in_var_handle);

      // add output var handle
      for (size_t j = 0; j < place_list_.size(); ++j) {
        std::unique_ptr<ir::Node> out_node = ir::CreateNodeForTest(
            "out_node" + std::to_string(i), ir::Node::Type::kVariable);
        VarHandle* out_var_handle = new VarHandle(
            out_node.get(), 2, j, OutVarName(i), place_list_[j]);
        vars_.emplace_back(out_var_handle);
        op_handle_->AddOutput(out_var_handle);
      }
    }
  }

  // Runs the fused broadcast on LoDTensor inputs and checks every output in
  // every scope. Input k is filled with the values idx + k so that the
  // broadcasts can be told apart.
  void TestFusedBroadcastLoDTensor(std::vector<size_t> input_scope_idxes) {
    std::vector<std::vector<float>> send_vec;
    f::LoD lod{{0, 10, 20}};
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname = InVarName(i);
      float val_scalar = static_cast<float>(i);
      send_vec.push_back(
          InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
    }

    op_handle_->Run(false);

    WaitAll();
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname = OutVarName(i);
      for (size_t j = 0; j < place_list_.size(); ++j) {
        LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]);
      }
    }
  }

  // Runs the fused broadcast on SelectedRows inputs and checks every output
  // in every scope. Input k is filled with the values idx + k.
  void TestFusedBroadcastSelectedRows(std::vector<size_t> input_scope_idxes) {
    std::vector<std::vector<float>> send_vector;
    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
    int height = static_cast<int>(kDims[0] * 2);
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname = InVarName(i);
      float val_scalar = static_cast<float>(i);
      send_vector.push_back(InitSelectedRows(
          varname, input_scope_idxes[i], rows, height, val_scalar));
    }

    op_handle_->Run(false);

    WaitAll();
    for (size_t i = 0; i < input_scope_idxes.size(); ++i) {
      const std::string varname = OutVarName(i);
      for (size_t j = 0; j < place_list_.size(); ++j) {
        // Inspect destination scope j (as the LoDTensor test does) rather
        // than re-checking the source scope on every iteration.
        SelectedRowsEqual(varname, static_cast<int>(j), send_vector[i], rows,
                          height);
      }
    }
  }
};
// CPU run: two fused LoDTensor broadcasts whose sources are scopes 0 and 1.
TEST(FusedBroadcastTester, CPULodTensor) {
  const std::vector<size_t> source_scopes{0, 1};

  TestFusedBroadcastOpHandle fixture;
  fixture.InitCtxOnGpu(/*use_gpu=*/false);
  fixture.InitFusedBroadcastOp(source_scopes);
  fixture.TestFusedBroadcastLoDTensor(source_scopes);
}
// CPU run: two fused SelectedRows broadcasts whose sources are scopes 0 and 1.
TEST(FusedBroadcastTester, CPUSelectedRows) {
  const std::vector<size_t> source_scopes{0, 1};

  TestFusedBroadcastOpHandle fixture;
  fixture.InitCtxOnGpu(/*use_gpu=*/false);
  fixture.InitFusedBroadcastOp(source_scopes);
  fixture.TestFusedBroadcastSelectedRows(source_scopes);
}
#ifdef PADDLE_WITH_CUDA
// GPU run: two fused LoDTensor broadcasts whose sources are scopes 0 and 1.
TEST(FusedBroadcastTester, GPULodTensor) {
  const std::vector<size_t> source_scopes{0, 1};

  TestFusedBroadcastOpHandle fixture;
  fixture.InitCtxOnGpu(/*use_gpu=*/true);
  fixture.InitFusedBroadcastOp(source_scopes);
  fixture.TestFusedBroadcastLoDTensor(source_scopes);
}
// GPU run: two fused SelectedRows broadcasts whose sources are scopes 0 and 1.
TEST(FusedBroadcastTester, GPUSelectedRows) {
  const std::vector<size_t> source_scopes{0, 1};

  TestFusedBroadcastOpHandle fixture;
  fixture.InitCtxOnGpu(/*use_gpu=*/true);
  fixture.InitFusedBroadcastOp(source_scopes);
  fixture.TestFusedBroadcastSelectedRows(source_scopes);
}
#endif
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/gather_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() {
VarHandle
*
out_var_handle
;
{
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
outputs_
);
auto
out_var_handles
=
DynamicCast
<
VarHandle
>
(
this
->
Outputs
()
);
PADDLE_ENFORCE_EQ
(
out_var_handles
.
size
(),
1
,
"The number of output should be one."
);
out_var_handle
=
out_var_handles
.
front
();
...
...
@@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() {
Tensor
*
out_tensor
=
out_value
->
mutable_value
();
// copy
auto
dev_ctx
=
dev_ctxes_
[
out_var_handle
->
place_
]
;
auto
dev_ctx
=
dev_ctxes_
.
at
(
out_var_handle
->
place_
)
;
RunAndRecordEvent
(
out_var_handle
->
place_
,
[
in_tensors
,
out_tensor
,
&
dev_ctx
,
t_out_p
]
{
int
s
=
0
,
e
=
0
;
...
...
paddle/fluid/framework/details/op_handle_base.cc
浏览文件 @
a412826c
...
...
@@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
void
OpHandleBase
::
WaitInputVarGenerated
(
const
platform
::
Place
&
place
)
{
for
(
auto
*
in
:
inputs_
)
{
if
(
NeedWait
(
in
))
{
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
[
place
]
);
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
.
at
(
place
)
);
}
}
}
...
...
paddle/fluid/framework/details/reduce_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -27,7 +27,7 @@ namespace framework {
namespace
details
{
void
ReduceOpHandle
::
RunImpl
()
{
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
begin
()
->
second
);
platform
::
RecordEvent
record_event
(
Name
(),
dev_ctxes_
.
c
begin
()
->
second
);
if
(
places_
.
size
()
==
1
)
return
;
// the input and output may have dummy var.
...
...
paddle/fluid/framework/details/reduce_op_handle.h
浏览文件 @
a412826c
...
...
@@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase {
nccl_ctxs_
(
nccl_ctxs
)
{
if
(
nccl_ctxs_
)
{
for
(
auto
&
p_ctx
:
nccl_ctxs_
->
contexts_
)
{
dev_ctxes_
[
platform
::
CUDAPlace
(
p_ctx
.
first
)]
=
p_ctx
.
second
.
ctx_
.
get
();
this
->
SetDeviceContext
(
platform
::
CUDAPlace
(
p_ctx
.
first
),
p_ctx
.
second
.
ctx_
.
get
());
}
}
}
...
...
paddle/fluid/framework/details/rpc_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() {
continue
;
}
if
(
in
->
GeneratedOp
())
{
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
[
p
]
);
in
->
GeneratedOp
()
->
RecordWaitEventOnCtx
(
dev_ctxes_
.
at
(
p
)
);
}
}
auto
&
tmp_scope
=
local_scope_
->
FindVar
(
kLocalExecScopeName
)
->
Get
<
Scope
*>
();
...
...
paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
浏览文件 @
a412826c
...
...
@@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
coeff_
(
static_cast
<
float
>
(
1.0
/
num_dev
)),
scope_
(
scope
),
place_
(
place
)
{
dev_ctxes_
[
place_
]
=
dev_ctx
;
this
->
SetDeviceContext
(
place_
,
dev_ctx
)
;
}
ScaleLossGradOpHandle
::~
ScaleLossGradOpHandle
()
{}
...
...
@@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() {
}
else
{
#ifdef PADDLE_WITH_CUDA
this
->
RunAndRecordEvent
([
&
]
{
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
this
->
dev_ctxes_
[
place_
]
)
->
stream
();
auto
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
this
->
dev_ctxes_
.
at
(
place_
)
)
->
stream
();
memory
::
Copy
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place_
),
tmp
,
platform
::
CPUPlace
(),
&
coeff_
,
sizeof
(
float
),
stream
);
VLOG
(
10
)
<<
place_
<<
"RUN Scale loss grad op"
;
...
...
paddle/fluid/framework/details/sequential_execution_pass.cc
0 → 100644
浏览文件 @
a412826c
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/sequential_execution_pass.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// Returns true when both descriptions have the same type, the same inputs and
// the same outputs. Comparisons run in that order and stop at the first
// mismatch.
static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) {
  if (!(op1->Type() == op2->Type())) {
    return false;
  }
  if (!(op1->Inputs() == op2->Inputs())) {
    return false;
  }
  return op1->Outputs() == op2->Outputs();
}
// Chains the graph's op nodes with control-dependency variables so that they
// are ordered like the OpDesc list stored under kAllOpDescs.
//
// Steps:
//   1. Count, for every op node, how many distinct ops produce its inputs and
//      remember which ops wait on which producer.
//   2. Walk the OpDesc list; each description must match exactly one op node
//      that is currently ready. Matching a node releases its dependants.
//   3. Link consecutive matched nodes with a fresh control-dependency var.
std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  // FIXME(zjl): Insert dependencies between some distributed ops may cause
  // the multi_devices_graph_pass fails. So we skip these ops here.
  // Indeed, maybe we should not insert dependencies between these ops
  // casually, which may cause deadlock easily.
  // We should add more skipped distributed ops when found errors in
  // multi_devices_graph_pass
  static std::unordered_set<std::string> skip_dist_ops{
      "send", "recv", "send_barrier", "fetch_barrier"};

  auto &program_ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);

  std::vector<ir::Node *> ordered_nodes;
  ordered_nodes.reserve(program_ops.size());

  // Number of unfinished producer ops per op node.
  std::unordered_map<ir::Node *, size_t> unmet_deps;
  // Producer op -> ops that consume one of its outputs.
  std::unordered_map<ir::Node *, std::unordered_set<ir::Node *>> dependants;
  // Op nodes whose producers have all been matched already.
  std::unordered_set<ir::Node *> ready;

  // Step 1: build the dependency bookkeeping.
  for (ir::Node *candidate : graph->Nodes()) {
    if (!candidate->IsOp()) {
      continue;
    }

    std::unordered_set<ir::Node *> producers;
    for (auto *in_var : candidate->inputs) {
      PADDLE_ENFORCE(in_var->IsVar(),
                     "Preceding Node of Op Nodes must be Var Node");
      // Variables nobody writes add no dependency.
      if (in_var->inputs.empty()) {
        continue;
      }
      PADDLE_ENFORCE(in_var->inputs.size() == 1 && in_var->inputs[0]->IsOp(),
                     "Preceding Op Node of Var Node must be unique");
      ir::Node *producer = in_var->inputs[0];
      producers.insert(producer);
      dependants[producer].insert(candidate);
    }

    unmet_deps[candidate] = producers.size();
    if (producers.empty()) {
      ready.insert(candidate);
    }
  }

  // Step 2: match every OpDesc against the ready set, in program order.
  for (auto *desc : program_ops) {
    ir::Node *matched = nullptr;
    for (auto *ready_node : ready) {
      if (!IsSameOpDesc(desc, ready_node->Op())) {
        continue;
      }
      PADDLE_ENFORCE(matched == nullptr,
                     "Found multiple op_desc in graph: %s", desc->Type());
      matched = ready_node;
    }
    PADDLE_ENFORCE_NOT_NULL(matched, "Cannot find op_desc in graph: %s",
                            desc->Type());

    // Release the ops that were waiting on the matched node.
    for (auto *waiting : dependants[matched]) {
      size_t &remaining = unmet_deps.at(waiting);
      --remaining;
      if (remaining == 0) {
        ready.insert(waiting);
      }
    }
    ready.erase(matched);

    // Distributed ops listed above are matched but left out of the chain.
    if (skip_dist_ops.count(desc->Type()) == 0) {
      ordered_nodes.push_back(matched);
    }
  }

  // Step 3: connect neighbours through control-dependency variables.
  for (size_t pos = 1; pos < ordered_nodes.size(); ++pos) {
    ir::Node *earlier = ordered_nodes[pos - 1];
    ir::Node *later = ordered_nodes[pos];

    auto *dep_var = graph->CreateControlDepVar();
    later->inputs.push_back(dep_var);
    earlier->outputs.push_back(dep_var);
    dep_var->outputs.push_back(later);
    dep_var->inputs.push_back(earlier);

    VLOG(10) << "Add dependencies between " << earlier->Name() << " and "
             << later->Name();
  }
  return graph;
}
}
// namespace details
}
// namespace framework
}
// namespace paddle
// Registers SequentialExecutionPass under the name
// "sequential_execution_pass" and declares kAllOpDescs as a required pass
// attribute; ApplyImpl reads the op list from that attribute.
REGISTER_PASS
(
sequential_execution_pass
,
paddle
::
framework
::
details
::
SequentialExecutionPass
)
.
RequirePassAttr
(
paddle
::
framework
::
details
::
kAllOpDescs
);
paddle/fluid/framework/details/sequential_execution_pass.h
0 → 100644
浏览文件 @
a412826c
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
// Key of the pass attribute that SequentialExecutionPass is registered as
// requiring.
constexpr char kAllOpDescs[] = "all_op_descs";

// Graph pass declaration. The transformation is implemented out of line in
// ApplyImpl.
class SequentialExecutionPass : public ir::Pass {
 protected:
  // Receives ownership of `graph` and returns the graph to use afterwards.
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};
}
// namespace details
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
浏览文件 @
a412826c
...
...
@@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
new
platform
::
RecordEvent
(
"ThreadedSSAGraphExecutorPrepare"
,
nullptr
));
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
pending_ops
;
std
::
unordered_set
<
VarHandleBase
*>
pending_vars
;
BlockingQueue
<
VarHandleBase
*>
ready_vars
;
auto
ready_vars
=
std
::
make_shared
<
BlockingQueue
<
VarHandleBase
*>>
()
;
std
::
unordered_set
<
OpHandleBase
*>
ready_ops
;
// For ops (e.g. nccl_all_reduce) that need to coordinate multiple
// streams from multiple GPUs, it's faster to buffer them and schedule
...
...
@@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
for
(
auto
&
var_map
:
graph_
->
Get
<
details
::
GraphVars
>
(
details
::
kGraphVars
))
{
for
(
auto
&
name_pair
:
var_map
)
{
for
(
auto
&
version_pair
:
name_pair
.
second
)
{
InsertPendingVar
(
&
pending_vars
,
&
ready_vars
,
version_pair
.
get
());
InsertPendingVar
(
&
pending_vars
,
ready_vars
.
get
()
,
version_pair
.
get
());
}
}
}
for
(
auto
&
var
:
graph_
->
Get
<
details
::
GraphDepVars
>
(
details
::
kGraphDepVars
))
{
InsertPendingVar
(
&
pending_vars
,
&
ready_vars
,
var
.
get
());
InsertPendingVar
(
&
pending_vars
,
ready_vars
.
get
()
,
var
.
get
());
}
for
(
auto
&
op
:
graph_
->
Get
<
details
::
GraphOps
>
(
details
::
kGraphOps
))
{
...
...
@@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
FeedFetchList
fetch_data
(
fetch_tensors
.
size
());
InsertFetchOps
(
fetch_tensors
,
&
fetch_ops
,
&
fetch_dependencies
,
&
pending_ops
,
&
pending_vars
,
&
ready_vars
,
&
fetch_data
);
&
pending_vars
,
ready_vars
.
get
()
,
&
fetch_data
);
auto
run_all_ops
=
[
&
](
std
::
unordered_set
<
OpHandleBase
*>
&
set
)
{
for
(
auto
*
op
:
set
)
{
running_ops_
++
;
RunOp
(
&
ready_vars
,
op
);
RunOp
(
ready_vars
,
op
);
}
set
.
clear
();
};
...
...
@@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
run_op_futures_
.
clear
();
exception_holder_
.
Clear
();
event
.
reset
(
nullptr
);
// Step 3. Execution
while
(
!
pending_vars
.
empty
())
{
// 1. Run All Ready ops
...
...
@@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// 2. Find ready variable
bool
timeout
;
auto
cur_ready_vars
=
ready_vars
.
PopAll
(
1
,
&
timeout
);
auto
cur_ready_vars
=
ready_vars
->
PopAll
(
1
,
&
timeout
);
if
(
timeout
)
{
if
(
exception_holder_
.
IsCaught
())
{
...
...
@@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
}
}
PADDLE_ENFORCE
(
ready_ops
.
empty
());
// Wait FetchOps.
ClearFetchOp
(
graph_
.
get
(),
&
fetch_ops
);
...
...
@@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
}
void
ThreadedSSAGraphExecutor
::
RunOp
(
BlockingQueue
<
VarHandleBase
*>
*
ready_var_q
,
details
::
OpHandleBase
*
op
)
{
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
details
::
OpHandleBase
*
op
)
{
auto
op_run
=
[
ready_var_q
,
op
,
this
]
{
try
{
if
(
VLOG_IS_ON
(
10
))
{
...
...
paddle/fluid/framework/details/threaded_ssa_graph_executor.h
浏览文件 @
a412826c
...
...
@@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
~
ThreadedSSAGraphExecutor
()
{}
private:
void
RunOp
(
BlockingQueue
<
VarHandleBase
*>
*
ready_var_q
,
void
RunOp
(
const
std
::
shared_ptr
<
BlockingQueue
<
VarHandleBase
*>>
&
ready_var_q
,
details
::
OpHandleBase
*
op
);
private:
...
...
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -41,6 +41,7 @@ pass_library(conv_bn_fuse_pass inference)
pass_library
(
seqconv_eltadd_relu_fuse_pass inference
)
if
(
WITH_MKLDNN
)
pass_library
(
mkldnn_placement_pass base
)
pass_library
(
depthwise_conv_mkldnn_pass base
)
pass_library
(
conv_bias_mkldnn_fuse_pass inference
)
pass_library
(
conv_relu_mkldnn_fuse_pass inference
)
pass_library
(
conv_elementwise_add_mkldnn_fuse_pass inference
)
...
...
@@ -59,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph
cc_test
(
test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector
)
cc_test
(
test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto
)
if
(
WITH_MKLDNN
)
cc_test
(
test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass
)
cc_test
(
test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass
)
cc_test
(
test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass
)
endif
()
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h
浏览文件 @
a412826c
...
...
@@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase {
virtual
~
ConvReLUFusePass
()
{}
protected:
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
;
std
::
unique_ptr
<
ir
::
Graph
>
ApplyImpl
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
)
const
override
;
};
}
// namespace ir
...
...
paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
浏览文件 @
a412826c
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op
->
SetInput
(
"X"
,
inputs
);
}
op
->
SetOutput
(
"Out"
,
outputs
);
op
->
SetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
OpRole
::
kForward
));
}
// a->OP0->b
...
...
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// GET_NODE(id, pattern): fetch the pattern node registered under the name #id
// and bind the graph node mapped to it in `subgraph` to a local pointer `id`.
// Requires a variable named `subgraph` to be in scope at the expansion site.
// Enforces that `subgraph` contains an entry for the pattern node and that the
// mapped node pointer is non-null.
#define GET_NODE(id, pattern) \
PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
"pattern has no Node called %s", #id); \
auto* id = subgraph.at(pattern.RetrieveNode(#id)); \
PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
// Changes the type of every `depthwise_conv2d` op whose `use_mkldnn`
// attribute is true to `conv2d`, then passes the number of ops changed to
// AddStatis.
//
// Takes ownership of `graph` and returns that same graph after the rewrite.
std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get());

  GraphPatternDetector gpd;
  auto* pattern = gpd.mutable_pattern();

  // Single-node pattern: a depthwise_conv2d op with use_mkldnn == true.
  auto* conv_pattern = pattern->NewNode("depthwise_conv");
  conv_pattern->assert_is_op("depthwise_conv2d")
      ->assert_op_attr("use_mkldnn", true);

  int rewritten = 0;
  // The parameter must be called `subgraph`: GET_NODE refers to it by name.
  auto rewrite_to_conv2d =
      [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
        VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
        GET_NODE(depthwise_conv, (*pattern));
        depthwise_conv->Op()->SetType("conv2d");
        ++rewritten;
      };

  gpd(graph.get(), rewrite_to_conv2d);
  AddStatis(rewritten);
  return graph;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
// Register DepthwiseConvMKLDNNPass under the name depthwise_conv_mkldnn_pass.
REGISTER_PASS
(
depthwise_conv_mkldnn_pass
,
paddle
::
framework
::
ir
::
DepthwiseConvMKLDNNPass
);
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Declaration of the depthwise-convolution MKL-DNN pass, built on
// FusePassBase. ApplyImpl is defined out of line.
class DepthwiseConvMKLDNNPass : public FusePassBase {
 public:
  virtual ~DepthwiseConvMKLDNNPass() {}

 protected:
  // Receives ownership of `graph` and returns the graph to use afterwards.
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc
0 → 100644
浏览文件 @
a412826c
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h"
#include <gtest/gtest.h>
namespace
paddle
{
namespace
framework
{
namespace
ir
{
// Test helper: appends one convolution-style op to block 0 of `prog`.
//
//   type        op type to set on the new op
//   name        stored in the op's "name" attribute
//   inputs      exactly the first three entries are used, in order, as the
//               "Input", "Filter" and "Bias" arguments
//   outputs     stored as the "Out" arguments
//   use_mkldnn  stored in the op's "use_mkldnn" attribute
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs, bool use_mkldnn = false) {
  auto* desc = prog->MutableBlock(0)->AppendOp();
  desc->SetType(type);
  desc->SetAttr("use_mkldnn", use_mkldnn);
  desc->SetAttr("name", name);

  // Input slots are filled positionally from `inputs`.
  static const char* const kInputSlots[] = {"Input", "Filter", "Bias"};
  for (size_t i = 0; i < 3; ++i) {
    desc->SetInput(kInputSlots[i], {inputs[i]});
  }

  desc->SetOutput("Out", outputs);
}
// (a, weights, bias)->depthwise conv mkldnn->b
// (b, weights2, bias2)->depthwise conv no mkldnn->c
// (c, weights3, bias3)->conv mkldnn->d
// (d, weights4, bias4)->conv no mkldnn->e
// Builds the four-op chain used by the test:
//   a -> depthwise_conv2d (use_mkldnn=true)  -> b
//   b -> depthwise_conv2d (use_mkldnn=false) -> c
//   c -> conv2d           (use_mkldnn=true)  -> d
//   d -> conv2d           (use_mkldnn=false) -> e
// Each op also takes its own weights/bias pair, which is marked persistable.
ProgramDesc BuildProgramDesc() {
  ProgramDesc prog;

  // Declares `var_name` in block 0 as a SELECTED_ROWS variable.
  auto declare = [&prog](const std::string& var_name, bool persistable) {
    auto* var = prog.MutableBlock(0)->Var(var_name);
    var->SetType(proto::VarType::SELECTED_ROWS);
    if (persistable) {
      var->SetPersistable(true);
    }
  };

  using Names = std::vector<std::string>;
  const Names activations({"a", "b", "c", "d", "e"});
  const Names parameters({"weights", "bias", "weights2", "bias2", "weights3",
                          "bias3", "weights4", "bias4"});
  for (const auto& v : activations) {
    declare(v, false);
  }
  for (const auto& v : parameters) {
    declare(v, true);
  }

  // depthwise conv with MKL-DNN
  SetOp(&prog, "depthwise_conv2d", "conv1", Names({"a", "weights", "bias"}),
        Names({"b"}), true);
  // depthwise conv without MKL-DNN
  SetOp(&prog, "depthwise_conv2d", "conv2", Names({"b", "weights2", "bias2"}),
        Names({"c"}), false);
  // conv with MKL-DNN
  SetOp(&prog, "conv2d", "conv3", Names({"c", "weights3", "bias3"}),
        Names({"d"}), true);
  // conv without MKL-DNN
  SetOp(&prog, "conv2d", "conv4", Names({"d", "weights4", "bias4"}),
        Names({"e"}), false);

  return prog;
}
// Applies depthwise_conv_mkldnn_pass to the program from BuildProgramDesc and
// checks the op counts: the MKL-DNN depthwise conv must become a conv2d, and
// the other three ops must be left as they were.
TEST(DepthwiseConvMKLDNNPass, basic) {
  auto prog = BuildProgramDesc();
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass");

  struct counters {
    int mkldnn_depthwise_conv_nodes;
    int other_depthwise_conv_nodes;
    int mkldnn_conv_nodes;
    int other_conv_nodes;
  };
  // The input program has one op of each kind.
  counters before{1, 1, 1, 1};

  graph = pass->Apply(std::move(graph));

  // initialize counters before loop
  counters after{0, 0, 0, 0};
  for (auto* node : graph->Nodes()) {
    if (!node->IsOp()) {
      continue;
    }
    auto* op = node->Op();
    const std::string op_type = op->Type();
    const bool is_conv = (op_type == "conv2d");
    if (!is_conv && op_type != "depthwise_conv2d") {
      continue;
    }
    // Only read the attribute for the two op types being counted.
    const bool use_mkldnn = boost::get<bool>(op->GetAttr("use_mkldnn"));
    if (is_conv) {
      ++(use_mkldnn ? after.mkldnn_conv_nodes : after.other_conv_nodes);
    } else {
      ++(use_mkldnn ? after.mkldnn_depthwise_conv_nodes
                    : after.other_depthwise_conv_nodes);
    }
  }

  EXPECT_EQ(after.other_depthwise_conv_nodes,
            before.other_depthwise_conv_nodes);
  EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes);
  EXPECT_EQ(after.mkldnn_depthwise_conv_nodes,
            before.mkldnn_depthwise_conv_nodes - 1);
  EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1);
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
// Make the pass under test available to this test binary.
// NOTE(review): presumably this forces the pass's registration object to be
// linked in so PassRegistry can find it by name -- confirm against USE_PASS.
USE_PASS
(
depthwise_conv_mkldnn_pass
);
paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
浏览文件 @
a412826c
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace
paddle
{
namespace
framework
{
...
...
@@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type,
op
->
SetInput
(
"X"
,
inputs
);
}
op
->
SetOutput
(
"Out"
,
outputs
);
op
->
SetAttr
(
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
OpRole
::
kForward
));
}
// a->OP0->b
...
...
paddle/fluid/framework/ir/graph.cc
浏览文件 @
a412826c
...
...
@@ -23,8 +23,67 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
namespace
ir
{
namespace
{
// Validates the ordering of op roles among the ops of block 0 of `program`.
//
// Ops are visited in program order; ops without an op-role attribute are
// ignored. For each op the roles seen so far (including its own) are checked:
//   * forward          : logs an error if a backward op was already seen.
//   * backward[|loss]  : enforced to come before any optimize op.
//   * forward|loss     : enforced to come before any backward|loss op and
//                        before any optimize op.
//   * optimize[|lrsched]: enforced to come after at least one backward op.
//   * lrsched, dist, rpc, not-specified: no constraint.
// Any other role value is a fatal error.
void CheckProgram(const ProgramDesc& program) {
#define _INT(role) static_cast<int>(role)

  // Role ids encountered so far; only key presence is ever consulted.
  std::map<int, bool> visit;
  for (OpDesc* op : program.Block(0).AllOps()) {
    // For backward compatibility, some program doesn't have role added.
    if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue;
    int role_id =
        boost::get<int>(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName()));
    visit[role_id] = true;
    switch (role_id) {
      case _INT(OpRole::kForward):
        if (visit.find(_INT(OpRole::kBackward)) != visit.end()) {
          // Stream-style logging does no printf substitution, so the op type
          // is streamed into the message instead of using a "%s" placeholder.
          LOG(ERROR) << "Cannot add backward operator before forward operator "
                     << op->Type() << ".";
        }
        break;
      case _INT(OpRole::kBackward):
      case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
        PADDLE_ENFORCE(
            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
            "Cannot add backward operator %s after optimize operator.",
            op->Type());
        break;
      case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
                                  _INT(OpRole::kLoss)) == visit.end(),
                       "Cannot add backward|loss operator before "
                       "forward|loss operator %s.",
                       op->Type());
        PADDLE_ENFORCE(
            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
            "Cannot add forward|loss operator %s after optimize operator.",
            op->Type());
        break;
      case _INT(OpRole::kOptimize):
      case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
                       "Optimize operators %s must follow backward operator.",
                       op->Type());
        break;
      case _INT(OpRole::kLRSched):
      case _INT(OpRole::kDist):
      case _INT(OpRole::kRPC):
      case _INT(OpRole::kNotSpecified):
        break;
      default:
        LOG(FATAL) << "Unknown operator role. Don't add new role because "
                      "you don't know what you are doing.";
    }
  }

#undef _INT
}
}
// namespace
Graph
::
Graph
(
const
ProgramDesc
&
program
)
:
program_
(
program
)
{
CheckProgram
(
program_
);
// Make the nodes id start from 0.
Node
::
ResetId
();
auto
var_nodes
=
InitFromProgram
(
program_
);
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
浏览文件 @
a412826c
...
...
@@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() {
return
result
;
}
// Ordering predicate for (PDNode*, Node*) pairs: compares the PDNode pointers
// first and falls back to the Node pointers when the PDNodes are the same.
bool GraphItemCMP(const std::pair<PDNode*, Node*>& a,
                  const std::pair<PDNode*, Node*>& b) {
  if (a.first == b.first) {
    return a.second < b.second;
  }
  return a.first < b.first;
}
// TODO(Superjomn) enhance the function as it marks unique patterns as duplicates
// see https://github.com/PaddlePaddle/Paddle/issues/13550
void
GraphPatternDetector
::
UniquePatterns
(
...
...
@@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns(
std
::
vector
<
GraphPatternDetector
::
subgraph_t
>
result
;
std
::
unordered_set
<
size_t
>
set
;
std
::
hash
<
std
::
string
>
hasher
;
for
(
auto
&
g
:
*
subgraphs
)
{
size_t
key
=
0
;
for
(
auto
&
item
:
g
)
{
key
^=
std
::
hash
<
void
*>
{}(
item
.
first
);
key
^=
std
::
hash
<
void
*>
{}(
item
.
second
);
// Sort the items in the sub-graph, and transform to a string key.
std
::
vector
<
std
::
pair
<
PDNode
*
,
Node
*>>
sorted_keys
(
g
.
begin
(),
g
.
end
());
std
::
sort
(
sorted_keys
.
begin
(),
sorted_keys
.
end
(),
GraphItemCMP
);
std
::
stringstream
ss
;
for
(
auto
&
item
:
sorted_keys
)
{
ss
<<
item
.
first
<<
":"
<<
item
.
second
;
}
auto
key
=
hasher
(
ss
.
str
());
if
(
!
set
.
count
(
key
))
{
result
.
emplace_back
(
g
);
set
.
insert
(
key
);
...
...
paddle/fluid/framework/lod_tensor.cc
浏览文件 @
a412826c
...
...
@@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor(
PADDLE_ENFORCE_EQ
(
new_lod
.
size
(),
lod
.
size
());
for
(
size_t
j
=
0
;
j
<
lod
.
size
();
++
j
)
{
auto
&
sub_lod
=
new_lod
[
j
];
auto
&
offset
=
sub_lod
.
back
();
size_t
offset
=
sub_lod
.
back
();
for
(
size_t
k
=
1
;
k
<
lod
[
j
].
size
();
++
k
)
{
sub_lod
.
push_back
(
lod
[
j
][
k
]
+
offset
);
}
...
...
paddle/fluid/framework/lod_tensor_array.h
浏览文件 @
a412826c
...
...
@@ -19,81 +19,7 @@ limitations under the License. */
namespace
paddle
{
namespace
framework
{
// NOTE The vector<LoDTensor> can't be replaced with the class LoDTensorArray
// directly, because there are many vector<LoDTensor> used accross the project,
// and some of them are treated as LoDTensorArray.
#if !defined(PADDLE_ON_INFERENCE)
using
LoDTensorArray
=
std
::
vector
<
LoDTensor
>
;
#else // !PADDLE_ON_INFERENCE
#pragma message "LoDTensorArray is replaced with the inference one."
/*
* A LoDTensorArray which will not deallocate buffer when resized, fix the data
* diff in inference, and more performance friendly in the concurrency
* scenerios.
*/
class
LoDTensorArray
{
public:
LoDTensorArray
()
=
default
;
using
iterator
=
std
::
vector
<
LoDTensor
>::
iterator
;
using
const_iterator
=
std
::
vector
<
LoDTensor
>::
const_iterator
;
const_iterator
begin
()
const
{
return
array_
.
begin
();
}
const_iterator
end
()
const
{
return
array_
.
begin
()
+
size_
;
}
iterator
begin
()
{
return
array_
.
begin
();
}
iterator
end
()
{
return
array_
.
begin
()
+
size_
;
}
void
push_back
(
const
LoDTensor
&
x
)
{
if
(
size_
<
array_
.
size
())
{
array_
[
size_
++
]
=
x
;
}
else
{
array_
.
push_back
(
x
);
++
size_
;
}
}
void
resize
(
size_t
size
)
{
if
(
array_
.
size
()
<
size
)
{
array_
.
resize
(
size
);
}
size_
=
size
;
}
void
emplace_back
()
{
array_
.
emplace_back
();
}
void
emplace_back
(
LoDTensor
&&
x
)
{
array_
.
emplace_back
(
std
::
move
(
x
));
}
LoDTensor
&
back
()
{
return
array_
.
back
();
}
size_t
space
()
const
{
return
array_
.
size
();
}
void
reserve
(
size_t
size
)
{
// Naive warning to tell user this array might be to large. The memory and
// buffer used by this TensorArray will not be deleted during the training
// and inference phase, so attention not to make it expand too long.
if
(
size
>
800UL
)
{
LOG
(
WARNING
)
<<
"TensorArray has more than 800 items"
;
}
array_
.
reserve
(
size
);
}
bool
empty
()
const
{
return
size_
==
0UL
;
}
void
clear
()
{
size_
=
0UL
;
}
LoDTensor
&
operator
[](
size_t
id
)
{
return
array_
[
id
];
}
const
LoDTensor
&
operator
[](
size_t
id
)
const
{
return
array_
[
id
];
}
LoDTensor
&
at
(
size_t
id
)
{
return
array_
.
at
(
id
);
}
const
LoDTensor
&
at
(
size_t
id
)
const
{
return
array_
.
at
(
id
);
}
size_t
size
()
const
{
return
size_
;
}
private:
size_t
size_
{
0
};
std
::
vector
<
LoDTensor
>
array_
;
};
#endif // !PADDLE_ON_INFERENCE
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/operator.cc
浏览文件 @
a412826c
...
...
@@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() {
}
}
static
bool
VarIsTensor
(
const
Variable
*
var
)
{
return
var
->
IsType
<
LoDTensor
>
()
||
var
->
IsType
<
SelectedRows
>
();
static
bool
VarIsTensor
(
const
Variable
&
var
)
{
return
var
.
IsType
<
LoDTensor
>
()
||
var
.
IsType
<
SelectedRows
>
();
}
const
Tensor
*
GetTensorFromVar
(
Variable
*
var
)
{
if
(
var
->
IsType
<
LoDTensor
>
())
{
return
var
->
GetMutable
<
LoDTensor
>
(
);
}
else
if
(
var
->
IsType
<
SelectedRows
>
())
{
return
var
->
GetMutable
<
SelectedRows
>
()
->
mutable_value
(
);
const
Tensor
*
GetTensorFromVar
(
const
Variable
&
var
)
{
if
(
var
.
IsType
<
LoDTensor
>
())
{
return
static_cast
<
const
Tensor
*>
(
&
(
var
.
Get
<
LoDTensor
>
())
);
}
else
if
(
var
.
IsType
<
SelectedRows
>
())
{
return
&
(
var
.
Get
<
SelectedRows
>
().
value
()
);
}
else
{
PADDLE_THROW
(
"Variable type_id %s, expect LoDTensor/SelectedRows."
,
var
->
Type
().
name
());
var
.
Type
().
name
());
}
}
...
...
@@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
template
<
>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
auto
*
var
=
InputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
const_cast
<
Variable
*>
(
var
));
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
*
var
);
}
template
<
>
...
...
@@ -428,7 +427,7 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
var
);
return
var
==
nullptr
?
nullptr
:
GetTensorFromVar
(
*
var
);
});
return
res
;
}
...
...
@@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack(
for
(
auto
&
var_name
:
inplace_vars
)
{
VLOG
(
3
)
<<
"share inplace var "
+
var_name
+
" back to it's original scope"
;
auto
*
original_tensor
=
GetMutableTensorFromVar
(
scope
.
FindVar
(
var_name
));
auto
*
transformed_tensor
=
GetTensorFromVar
(
transfer_scope
.
FindVar
(
var_name
));
auto
*
var
=
transfer_scope
.
FindVar
(
var_name
);
PADDLE_ENFORCE
(
var
!=
nullptr
,
"The var[%s] should not be nullptr"
,
var_name
);
auto
*
transformed_tensor
=
GetTensorFromVar
(
*
var
);
original_tensor
->
ShareDataWith
(
*
transformed_tensor
);
}
}
...
...
@@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData(
for
(
auto
&
var_name
:
var_name_item
.
second
)
{
auto
*
var
=
scope
.
FindVar
(
var_name
);
// Only tensors can be transferred to another device.
if
(
var
==
nullptr
||
!
VarIsTensor
(
var
))
{
if
(
var
==
nullptr
||
!
VarIsTensor
(
*
var
))
{
continue
;
}
auto
*
tensor_in
=
GetTensorFromVar
(
var
);
auto
*
tensor_in
=
GetTensorFromVar
(
*
var
);
if
(
!
tensor_in
->
IsInitialized
())
{
continue
;
}
...
...
paddle/fluid/framework/operator.h
浏览文件 @
a412826c
...
...
@@ -63,7 +63,7 @@ inline std::string GradVarName(const std::string& var_name) {
}
proto
::
VarType
::
Type
GetDataTypeOfVar
(
const
Variable
*
var
);
const
Tensor
*
GetTensorFromVar
(
Variable
*
var
);
const
Tensor
*
GetTensorFromVar
(
const
Variable
&
var
);
class
OperatorBase
;
class
ExecutionContext
;
...
...
paddle/fluid/framework/parallel_executor.cc
浏览文件 @
a412826c
...
...
@@ -303,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
}
ParallelExecutor
::~
ParallelExecutor
()
{
const
auto
dev_ctxs
=
platform
::
DeviceContextPool
::
Instance
().
GetAllDeviceContexts
();
for
(
auto
&
dev_ctx
:
dev_ctxs
)
{
dev_ctx
->
Wait
();
for
(
auto
&
p
:
member_
->
places_
)
{
platform
::
DeviceContextPool
::
Instance
().
Get
(
p
)
->
Wait
();
}
if
(
member_
->
own_local_scope_
)
{
...
...
paddle/fluid/framework/tensor_test.cc
浏览文件 @
a412826c
...
...
@@ -75,6 +75,19 @@ TEST(Tensor, MutableData) {
platform
::
CPUPlace
());
EXPECT_EQ
(
p1
,
p2
);
}
// Not sure if it's desired, but currently, Tensor type can be changed.
{
framework
::
Tensor
src_tensor
;
int8_t
*
p1
=
src_tensor
.
mutable_data
<
int8_t
>
(
framework
::
make_ddim
({
1
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p1
,
nullptr
);
*
p1
=
1
;
uint8_t
*
p2
=
src_tensor
.
mutable_data
<
uint8_t
>
(
framework
::
make_ddim
({
1
}),
platform
::
CPUPlace
());
EXPECT_NE
(
p2
,
nullptr
);
EXPECT_EQ
(
static_cast
<
int
>
(
p2
[
0
]),
1
);
}
#ifdef PADDLE_WITH_CUDA
{
...
...
paddle/fluid/framework/tensor_util.cc
浏览文件 @
a412826c
...
...
@@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
auto
src_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_gpu_place
,
src_ptr
,
size
,
nullptr
);
}
else
if
(
platform
::
is_cuda_pinned_place
(
src_place
)
&&
platform
::
is_gpu_place
(
dst_place
))
{
auto
src_pinned_place
=
boost
::
get
<
platform
::
CUDAPinnedPlace
>
(
src_place
);
auto
dst_gpu_place
=
boost
::
get
<
platform
::
CUDAPlace
>
(
dst_place
);
memory
::
Copy
(
dst_gpu_place
,
dst_ptr
,
src_pinned_place
,
src_ptr
,
size
,
nullptr
);
}
#endif
}
...
...
paddle/fluid/inference/CMakeLists.txt
浏览文件 @
a412826c
if
(
WITH_TESTING
)
include
(
test.cmake
)
# some generic cmake functions for inference
endif
()
# analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library.
add_subdirectory
(
analysis
)
...
...
paddle/fluid/inference/analysis/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test
(
test_dot SRCS dot_tester.cc DEPS analysis
)
cc_binary
(
inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid
)
function
(
inference_analysis_test TARGET
)
if
(
WITH_TESTING
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS ARGS EXTRA_DEPS
)
cmake_parse_arguments
(
analysis_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
mem_opt
""
)
if
(
WITH_GPU
)
set
(
mem_opt
"--fraction_of_gpu_memory_to_use=0.5"
)
endif
()
cc_test
(
${
TARGET
}
SRCS
"
${
analysis_test_SRCS
}
"
DEPS analysis pass
${
GLOB_PASS_LIB
}
${
analysis_test_EXTRA_DEPS
}
ARGS --inference_model_dir=
${
PYTHON_TESTS_DIR
}
/book/word2vec.inference.model
${
mem_opt
}
${
analysis_test_ARGS
}
)
set_tests_properties
(
${
TARGET
}
PROPERTIES DEPENDS test_word2vec
)
endif
(
WITH_TESTING
)
function
(
inference_analysis_test TARGET
)
if
(
WITH_TESTING
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS ARGS EXTRA_DEPS
)
cmake_parse_arguments
(
analysis_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
inference_base_test
(
${
TARGET
}
SRCS
${
analysis_test_SRCS
}
DEPS analysis pass
${
GLOB_PASS_LIB
}
${
analysis_test_EXTRA_DEPS
}
ARGS --inference_model_dir=
${
WORD2VEC_MODEL_DIR
}
${
analysis_test_ARGS
}
)
endif
()
endfunction
(
inference_analysis_test
)
inference_analysis_test
(
test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api
)
...
...
paddle/fluid/inference/analysis/analyzer.h
浏览文件 @
a412826c
...
...
@@ -79,6 +79,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
"conv_bn_fuse_pass"
,
//
"conv_eltwiseadd_bn_fuse_pass"
,
//
#ifdef PADDLE_WITH_MKLDNN
"depthwise_conv_mkldnn_pass"
,
//
"conv_bias_mkldnn_fuse_pass"
,
//
"conv_relu_mkldnn_fuse_pass"
,
//
"conv_elementwise_add_mkldnn_fuse_pass"
,
//
...
...
paddle/fluid/inference/analysis/data_flow_graph_tester.cc
浏览文件 @
a412826c
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/analysis/data_flow_graph.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
...
...
@@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type,
op
->
SetType
(
type
);
op
->
SetInput
(
"Xs"
,
inputs
);
op
->
SetOutput
(
"Xs"
,
outputs
);
op
->
SetAttr
(
framework
::
OpProtoAndCheckerMaker
::
OpRoleAttrName
(),
static_cast
<
int
>
(
framework
::
OpRole
::
kForward
));
}
TEST
(
DataFlowGraph
,
Build_IR_Graph
)
{
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -17,39 +17,12 @@ if(APPLE)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-Wno-error=pessimizing-move"
)
endif
(
APPLE
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
set
(
inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor
${
GLOB_PASS_LIB
}
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
set
(
inference_deps
${
inference_deps
}
paddle_inference_tensorrt_subgraph_engine analysis_predictor
)
endif
()
function
(
inference_api_test TARGET_NAME
)
if
(
WITH_TESTING
)
set
(
options
""
)
set
(
oneValueArgs SRC
)
set
(
multiValueArgs ARGS
)
cmake_parse_arguments
(
inference_test
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
WITH_GPU
)
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/ --fraction_of_gpu_memory_to_use=0.15
)
else
()
cc_test
(
${
TARGET_NAME
}
SRCS
${
inference_test_SRC
}
DEPS
"
${
inference_deps
}
"
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book/
)
endif
()
if
(
inference_test_ARGS
)
set_tests_properties
(
${
TARGET_NAME
}
PROPERTIES DEPENDS
"
${
inference_test_ARGS
}
"
)
endif
()
endif
(
WITH_TESTING
)
endfunction
(
inference_api_test
)
cc_library
(
reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope
)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope
)
cc_library
(
analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor
)
...
...
@@ -59,8 +32,11 @@ cc_test(test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api
)
inference_api_test
(
test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification
)
if
(
WITH_TESTING
)
inference_base_test
(
test_api_impl SRCS api_impl_tester.cc DEPS
${
inference_deps
}
ARGS --word2vec_dirname=
${
WORD2VEC_MODEL_DIR
}
--book_dirname=
${
PYTHON_TESTS_DIR
}
/book
)
set_tests_properties
(
test_api_impl PROPERTIES DEPENDS test_image_classification
)
endif
()
cc_test
(
test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor
${
inference_deps
}
paddle_inference_api
ARGS --dirname=
${
PYTHON_TESTS_DIR
}
/book
)
...
...
@@ -68,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
cc_library
(
paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy
)
inference_api_test
(
test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec
)
if
(
WITH_TESTING
)
inference_base_test
(
test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS
${
inference_deps
}
ARGS --dirname=
${
WORD2VEC_MODEL_DIR
}
)
endif
()
endif
()
if
(
WITH_ANAKIN AND WITH_MKL
)
# only needed in CI
...
...
paddle/fluid/inference/api/api_impl_tester.cc
浏览文件 @
a412826c
...
...
@@ -22,12 +22,14 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"
#ifdef __clang__
#define ACC_DIFF 4e-
2
#define ACC_DIFF 4e-
3
#else
#define ACC_DIFF 1e-
2
#define ACC_DIFF 1e-
3
#endif
DEFINE_string
(
dirname
,
""
,
"Directory of the inference model."
);
DEFINE_string
(
word2vec_dirname
,
""
,
"Directory of the word2vec inference model."
);
DEFINE_string
(
book_dirname
,
""
,
"Directory of the book inference model."
);
namespace
paddle
{
...
...
@@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
NativeConfig
GetConfig
()
{
NativeConfig
config
;
config
.
model_dir
=
FLAGS_
dirname
+
"/word2vec.inference.model"
;
config
.
model_dir
=
FLAGS_
word2vec_dirname
;
LOG
(
INFO
)
<<
"dirname "
<<
config
.
model_dir
;
config
.
fraction_of_gpu_memory
=
0.15
;
#ifdef PADDLE_WITH_CUDA
...
...
@@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
FLAGS_dirname
+
"/image_classification_resnet.inference.model"
;
FLAGS_
book_
dirname
+
"/image_classification_resnet.inference.model"
;
const
bool
is_combined
=
false
;
std
::
vector
<
std
::
vector
<
int64_t
>>
feed_target_shapes
=
...
...
@@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) {
NativeConfig
config
=
GetConfig
();
config
.
use_gpu
=
use_gpu
;
config
.
model_dir
=
FLAGS_dirname
+
"/image_classification_resnet.inference.model"
;
FLAGS_
book_
dirname
+
"/image_classification_resnet.inference.model"
;
auto
main_predictor
=
CreatePaddlePredictor
<
NativeConfig
>
(
config
);
std
::
vector
<
framework
::
LoDTensor
>
jobs
(
num_jobs
);
...
...
paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc
浏览文件 @
a412826c
...
...
@@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
//# 1. Create PaddlePredictor with a config.
NativeConfig
config0
;
config0
.
model_dir
=
FLAGS_dirname
+
"word2vec.inference.model"
;
config0
.
model_dir
=
FLAGS_dirname
;
config0
.
use_gpu
=
true
;
config0
.
fraction_of_gpu_memory
=
0.3
;
config0
.
device
=
0
;
MixedRTConfig
config1
;
config1
.
model_dir
=
FLAGS_dirname
+
"word2vec.inference.model"
;
config1
.
model_dir
=
FLAGS_dirname
;
config1
.
use_gpu
=
true
;
config1
.
fraction_of_gpu_memory
=
0.3
;
config1
.
device
=
0
;
...
...
paddle/fluid/inference/api/demo_ci/run.sh
浏览文件 @
a412826c
...
...
@@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do
-DWITH_GPU
=
$TEST_GPU_CPU
\
-DWITH_STATIC_LIB
=
$WITH_STATIC_LIB
make
-j
word2vec_model
=
$
{
PADDLE_ROOT
}
'/build/python/paddle/fluid/tests/book
/word2vec.inference.model'
word2vec_model
=
$
DATA_DIR
'/word2vec
/word2vec.inference.model'
if
[
-d
$word2vec_model
]
;
then
for
use_gpu
in
$use_gpu_list
;
do
./simple_on_word2vec
\
...
...
paddle/fluid/inference/test.cmake
0 → 100644
浏览文件 @
a412826c
# Base URL the inference test models are downloaded from (user-overridable
# cache entry).
set(INFERENCE_URL
    "http://paddle-inference-dist.cdn.bcebos.com"
    CACHE STRING "inference download url")

# Directory the downloaded inference demo models are placed in
# (user-overridable cache entry).
set(INFERENCE_DEMO_INSTALL_DIR
    "${THIRD_PARTY_PATH}/inference_demo"
    CACHE STRING "A path setting inference demo download directories.")
# inference_download(install_dir url filename)
#
# Creates `install_dir` and downloads `${url}/${filename}` into it with wget.
# Runs at configure time through execute_process.
#
# The exit status of the download command is checked: the "finish
# downloading" status line is only printed on success, otherwise a warning is
# emitted (configuration still continues, as it did before).
function(inference_download install_dir url filename)
  message(STATUS "Download inference test stuff from ${url}/${filename}")
  execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
  execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}"
                  RESULT_VARIABLE download_status)
  # RESULT_VARIABLE holds the exit code, or an error string if the process
  # could not be started; anything other than 0 is reported as a failure.
  if(download_status EQUAL 0)
    message(STATUS "finish downloading ${filename}")
  else()
    message(WARNING "failed to download ${url}/${filename} (status: ${download_status})")
  endif()
endfunction()
# inference_download_and_uncompress(install_dir url filename)
#
# Downloads `${url}/${filename}` into `install_dir` via inference_download and
# then extracts it in place with `tar xzf`.
function(inference_download_and_uncompress install_dir url filename)
  inference_download(${install_dir} ${url} ${filename})
  set(untar_command "cd ${install_dir} && tar xzf ${filename}")
  execute_process(COMMAND bash -c "${untar_command}")
endfunction()
# Location of the word2vec inference model used by the API tests.
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")

# Fetch and unpack the model archive only when the install directory does not
# exist yet.
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR})
  inference_download_and_uncompress(
    ${WORD2VEC_INSTALL_DIR}
    ${INFERENCE_URL}
    "word2vec.inference.model.tar.gz")
endif()
# inference_base_test(TARGET SRCS <srcs...> DEPS <deps...> ARGS <args...>)
#
# Thin wrapper around cc_test for inference tests. SRCS, DEPS and ARGS are
# multi-value keyword arguments that are forwarded to cc_test. When WITH_GPU
# is enabled, "--fraction_of_gpu_memory_to_use=0.5" is prepended to ARGS.
function(inference_base_test TARGET)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS ARGS DEPS)
  cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  # Start from an empty value so that a `mem_opt` variable defined in the
  # caller's scope can never leak into the test arguments on non-GPU builds.
  set(mem_opt "")
  if(WITH_GPU)
    set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
  endif()
  cc_test(${TARGET}
          SRCS ${base_test_SRCS}
          DEPS ${base_test_DEPS}
          ARGS ${mem_opt} ${base_test_ARGS})
endfunction()
paddle/fluid/inference/tests/api/CMakeLists.txt
浏览文件 @
a412826c
set
(
INFERENCE_URL
"http://paddle-inference-dist.cdn.bcebos.com"
)
set
(
INFERENCE_DEMO_INSTALL_DIR
"
${
THIRD_PARTY_PATH
}
/inference_demo"
CACHE STRING
"A path setting inference demo download directories."
)
set
(
INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
)
function
(
inference_download install_dir url filename
)
message
(
STATUS
"Download inference test stuff from
${
url
}
/
${
filename
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
install_dir
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& wget -q
${
url
}
/
${
filename
}
"
)
message
(
STATUS
"finish downloading
${
filename
}
"
)
endfunction
()
function
(
inference_download_and_uncompress install_dir url filename
)
inference_download
(
${
install_dir
}
${
url
}
${
filename
}
)
execute_process
(
COMMAND bash -c
"cd
${
install_dir
}
&& tar xzf
${
filename
}
"
)
endfunction
()
function
(
download_model_and_data install_dir model_name data_name
)
if
(
NOT EXISTS
${
install_dir
}
)
...
...
paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
using Tensor = framework::Tensor;
using ScopedSpatialTransformerDescriptor =
    platform::ScopedSpatialTransformerDescriptor;

/**
 * Forward affine_grid kernel backed by cuDNN.
 *
 * Reads Input(Theta) and the target shape (attr "output_shape", or
 * Input(OutputShape) when the attribute is empty), allocates Output with
 * dims {N, H, W, 2} and fills it via cudnnSpatialTfGridGeneratorForward.
 */
template <typename T>
class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    auto cudnn_handle = cuda_ctx.cudnn_handle();

    auto* theta = ctx.Input<Tensor>("Theta");
    auto* output = ctx.Output<Tensor>("Output");
    const T* theta_ptr = theta->data<T>();
    int batch = theta->dims()[0];

    // Host-side copy of the 4 target sizes handed to the cuDNN descriptor.
    auto shape_attr = ctx.Attr<std::vector<int>>("output_shape");
    Tensor host_shape;
    int* host_shape_ptr;
    if (!shape_attr.empty()) {
      host_shape_ptr =
          host_shape.mutable_data<int>({4}, platform::CPUPlace());
      // The batch size always comes from Theta; the rest from the attribute.
      host_shape_ptr[0] = batch;
      for (int d = 1; d < 4; ++d) {
        host_shape_ptr[d] = shape_attr[d];
      }
    } else {
      // No attribute configured: take the sizes from Input(OutputShape).
      auto* shape_tensor = ctx.Input<Tensor>("OutputShape");
      framework::TensorCopy(*shape_tensor, platform::CPUPlace(), &host_shape);
      host_shape_ptr = host_shape.data<int>();
    }

    T* output_ptr = output->mutable_data<T>(
        {batch, host_shape_ptr[2], host_shape_ptr[3], 2}, ctx.GetPlace());

    ScopedSpatialTransformerDescriptor scoped_st_desc;
    cudnnSpatialTransformerDescriptor_t st_desc =
        scoped_st_desc.descriptor<T>(4, host_shape_ptr);

    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward(
        cudnn_handle, st_desc, theta_ptr, output_ptr));
  }
};
/**
 * Backward affine_grid kernel backed by cuDNN.
 *
 * Reads the gradient of Output and the target shape (attr "output_shape", or
 * Input(OutputShape) when the attribute is empty) and writes the gradient of
 * Theta via cudnnSpatialTfGridGeneratorBackward.
 */
template <typename T>
class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    auto cudnn_handle = cuda_ctx.cudnn_handle();

    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
    auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
    int batch = out_grad->dims()[0];

    // Host-side copy of the 4 target sizes handed to the cuDNN descriptor.
    auto shape_attr = ctx.Attr<std::vector<int>>("output_shape");
    Tensor host_shape;
    int* host_shape_ptr;
    if (!shape_attr.empty()) {
      host_shape_ptr =
          host_shape.mutable_data<int>({4}, platform::CPUPlace());
      // The batch size comes from the incoming gradient; the rest from the
      // attribute.
      host_shape_ptr[0] = batch;
      for (int d = 1; d < 4; ++d) {
        host_shape_ptr[d] = shape_attr[d];
      }
    } else {
      // No attribute configured: take the sizes from Input(OutputShape).
      auto* shape_tensor = ctx.Input<Tensor>("OutputShape");
      framework::TensorCopy(*shape_tensor, platform::CPUPlace(), &host_shape);
      host_shape_ptr = host_shape.data<int>();
    }

    ScopedSpatialTransformerDescriptor scoped_st_desc;
    cudnnSpatialTransformerDescriptor_t st_desc =
        scoped_st_desc.descriptor<T>(4, host_shape_ptr);

    const T* out_grad_ptr = out_grad->data<T>();
    T* theta_grad_ptr = theta_grad->mutable_data<T>(ctx.GetPlace());

    PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward(
        cudnn_handle, st_desc, out_grad_ptr, theta_grad_ptr));
  }
};
}
// namespace operators
}
// namespace paddle
namespace plat = paddle::platform;

// Register the cuDNN forward kernel of affine_grid for float and double.
REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNAffineGridOpKernel<float>,
                   paddle::operators::CUDNNAffineGridOpKernel<double>);

// Register the cuDNN backward kernel of affine_grid for float and double.
REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNAffineGridGradOpKernel<float>,
                   paddle::operators::CUDNNAffineGridGradOpKernel<double>);
paddle/fluid/operators/affine_grid_op.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/affine_grid_op.h"
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
using Tensor = framework::Tensor;

/**
 * CPU specialization of Linspace.
 *
 * Returns a 1-D CPU tensor holding `count` evenly spaced values starting at
 * `start`; for count > 1 the last value is `end`.
 *
 * When count == 1 there is no interval to divide, so the step is defined as
 * zero and the single element is `start`. (Dividing by `count - 1` in that
 * case would produce inf and the stored value `start + 0 * inf` would be
 * NaN.)
 *
 * @param start first value of the sequence.
 * @param end   last value of the sequence (for count > 1).
 * @param count number of samples to generate.
 * @param ctx   execution context (not used by the CPU implementation).
 */
template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> {
  framework::Tensor operator()(T start, T end, int count,
                               const framework::ExecutionContext& ctx) {
    Tensor numbers;
    T* number_data = numbers.mutable_data<T>({count}, platform::CPUPlace());
    // Step between two neighbouring samples; zero for a single sample.
    T slice = count > 1 ? (end - start) / static_cast<T>(count - 1)
                        : static_cast<T>(0);
    for (int i = 0; i < count; ++i) {
      number_data[i] = start + static_cast<T>(i) * slice;
    }
    return numbers;
  }
};
/**
 * Forward operator definition of affine_grid: validates the inputs, infers
 * the output shape and selects the kernel type.
 */
class AffineGridOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks Theta ([N, 2, 3]) and the target shape source, then declares
  // Output as [N, -1, -1, 2] (H and W are left unknown here).
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Theta"),
                   "Input(Theta) of AffineGridOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Output"),
                   "Output(Output) of AffineGridOp should not be null.");
    auto theta_dims = ctx->GetInputDim("Theta");
    PADDLE_ENFORCE(theta_dims.size() == 3,
                   "AffineGrid's Input(Theta) should be 3-D tensor.");

    auto shape_attr = ctx->Attrs().Get<std::vector<int>>("output_shape");
    if (!shape_attr.empty()) {
      // The attribute is configured: it must describe [N, C, H, W].
      PADDLE_ENFORCE(shape_attr.size() == 4,
                     "The size of attr(output_shape) should be 4.");
    } else {
      // Without the attribute the shape has to arrive as a 1-D input tensor.
      PADDLE_ENFORCE(ctx->HasInput("OutputShape"),
                     "Input(OutputShape) of AffineGridOp should not be null if "
                     "attr(output_shape) is not configured.");
      auto shape_input_dims = ctx->GetInputDim("OutputShape");
      PADDLE_ENFORCE(shape_input_dims.size() == 1,
                     "AffineGrid's Input(OutputShape) should be 1-D tensor.");
    }

    PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2.");
    PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3.");

    // N * H * W * 2
    ctx->SetOutputDim("Output",
                      framework::make_ddim({theta_dims[0], -1, -1, 2}));
    ctx->ShareLoD("Theta", "Output");
  }

 protected:
  // Uses the cuDNN library type when CUDA is compiled in and
  // CanCUDNNBeUsed(ctx) holds, the plain one otherwise; the data type is
  // taken from Input(Theta).
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    framework::LibraryType lib{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      lib = framework::LibraryType::kCUDNN;
    }
#endif
    auto theta_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
    return framework::OpKernelType(theta_type, ctx.GetPlace(),
                                   framework::DataLayout::kAnyLayout, lib);
  }
};
/**
 * Declares the inputs, outputs, attributes and user-facing documentation of
 * the affine_grid operator.
 */
class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "Theta",
        "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. "
        "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, "
        "y_1).");
    // Optional: only consulted when attr(output_shape) is left empty.
    AddInput("OutputShape",
             "(Tensor) The shape of target image with format [N, C, H, W].")
        .AsDispensable();
    AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2].");
    // The help text states the default that is actually configured below.
    AddAttr<bool>(
        "use_cudnn",
        "(bool, default true) Only used in cudnn kernel, need install cudnn")
        .SetDefault(true);
    AddAttr<std::vector<int>>(
        "output_shape",
        "The target output image shape with format [N, C, H, W].")
        .SetDefault(std::vector<int>());

    AddComment(R"DOC(
    It generates a grid of (x,y) coordinates using the parameters of the
    affine transformation that correspond to a set of points where the input
    feature map should be sampled to produce the transformed output feature map.

    Given:
        Theta = [[[x_11, x_12, x_13]
                  [x_14, x_15, x_16]]
                 [[x_21, x_22, x_23]
                  [x_24, x_25, x_26]]]

        OutputShape = [2, 3, 5, 5]

    Step 1:

        Generate relative coordinates according to OutputShape.
        The values of relative coordinates are in the interval between -1 and 1.
        The shape of the relative coordinates is [2, H, W] as below:

        C = [[[-1.  -1.  -1.  -1.  -1. ]
              [-0.5 -0.5 -0.5 -0.5 -0.5]
              [ 0.   0.   0.   0.   0. ]
              [ 0.5  0.5  0.5  0.5  0.5]
              [ 1.   1.   1.   1.   1. ]]
             [[-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]
              [-1.  -0.5  0.   0.5  1. ]]]
        C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.

    Step2:
        Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get:
        C_ = [[-1.  -1.   1. ]
              [-0.5 -1.   1. ]
              [ 0.  -1.   1. ]
              [ 0.5 -1.   1. ]
              [ 1.  -1.   1. ]
              [-1.  -0.5  1. ]
              [-0.5 -0.5  1. ]
              [ 0.  -0.5  1. ]
              [ 0.5 -0.5  1. ]
              [ 1.  -0.5  1. ]
              [-1.   0.   1. ]
              [-0.5  0.   1. ]
              [ 0.   0.   1. ]
              [ 0.5  0.   1. ]
              [ 1.   0.   1. ]
              [-1.   0.5  1. ]
              [-0.5  0.5  1. ]
              [ 0.   0.5  1. ]
              [ 0.5  0.5  1. ]
              [ 1.   0.5  1. ]
              [-1.   1.   1. ]
              [-0.5  1.   1. ]
              [ 0.   1.   1. ]
              [ 0.5  1.   1. ]
              [ 1.   1.   1. ]]
    Step3:
        Compute output by equation $$Output[i] = C_ * Theta[i]^T$$
    )DOC");
  }
};
/**
 * Backward operator definition of affine_grid: the gradient of Theta has the
 * same dims as Theta itself.
 */
class AffineGridOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Propagates Theta's dims to its gradient when that output is present.
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto theta_dims = ctx->GetInputDim("Theta");
    const auto theta_grad_name = framework::GradVarName("Theta");
    if (!ctx->HasOutput(theta_grad_name)) {
      return;
    }
    ctx->SetOutputDim(theta_grad_name, theta_dims);
  }

 protected:
  // Uses the cuDNN library type when CUDA is compiled in and
  // CanCUDNNBeUsed(ctx) holds, the plain one otherwise; the data type is
  // taken from Input(Theta).
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    framework::LibraryType lib{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      lib = framework::LibraryType::kCUDNN;
    }
#endif
    auto theta_type = framework::ToDataType(ctx.Input<Tensor>("Theta")->type());
    return framework::OpKernelType(theta_type, ctx.GetPlace(),
                                   framework::DataLayout::kAnyLayout, lib);
  }
};
/**
 * Builds the description of the affine_grid_grad operator from the forward
 * operator: forwards Theta, OutputShape, the gradient of Output and all
 * attributes, and produces the gradient of Theta.
 */
class AffineGridGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the descriptor from the start so it is released if any of the
    // setters below throws.
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("affine_grid_grad");
    op->SetInput("Theta", Input("Theta"));
    op->SetInput("OutputShape", Input("OutputShape"));
    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));

    op->SetAttrMap(Attrs());

    op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta"));
    return op;
  }
};
}
// namespace operators
}
// namespace paddle
namespace ops = paddle::operators;

// Forward operator together with its proto maker and gradient-desc maker.
REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker,
                  ops::AffineGridGradMaker);
// Backward operator.
REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad);

// CPU kernels for float and double.
REGISTER_OP_CPU_KERNEL(
    affine_grid,
    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AffineGridOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    affine_grid_grad,
    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::AffineGridGradOpKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/affine_grid_op.h
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
using Tensor = framework::Tensor;

// Row-major Eigen view over a framework tensor of rank D.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;

// Fixed-rank size arrays used for Eigen reshape/broadcast/shuffle arguments.
using Array1 = Eigen::DSizes<int64_t, 1>;
using Array2 = Eigen::DSizes<int64_t, 2>;
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;

/**
 * Functor returning a tensor with `count` evenly spaced numbers over the
 * interval given by `start` and `end`.
 *
 * Only the call operator is declared here; each device context supplies its
 * own definition through a specialization.
 */
template <typename DeviceContext, typename T>
struct Linspace {
  framework::Tensor operator()(T start, T end, int count,
                               const framework::ExecutionContext& ctx);
};
template
<
typename
DeviceContext
,
typename
T
>
class
AffineGridOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
*
theta
=
ctx
.
Input
<
Tensor
>
(
"Theta"
);
int
n
=
theta
->
dims
()[
0
];
auto
size_attr
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"output_shape"
);
int
h
=
0
;
int
w
=
0
;
if
(
size_attr
.
size
()
==
0
)
{
auto
*
output_shape
=
ctx
.
Input
<
Tensor
>
(
"OutputShape"
);
Tensor
h_sizes
;
framework
::
TensorCopy
(
*
output_shape
,
platform
::
CPUPlace
(),
&
h_sizes
);
const
int
*
h_size_data
=
h_sizes
.
data
<
int
>
();
h
=
h_size_data
[
2
];
w
=
h_size_data
[
3
];
}
else
{
h
=
size_attr
[
2
];
w
=
size_attr
[
3
];
}
auto
*
output
=
ctx
.
Output
<
Tensor
>
(
"Output"
);
output
->
mutable_data
<
T
>
({
n
,
h
,
w
,
2
},
ctx
.
GetPlace
());
math
::
SetConstant
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
output
,
static_cast
<
T
>
(
0
));
Linspace
<
DeviceContext
,
T
>
linspace
;
// Get indexes of height with shape [height, width, 1]
auto
h_idx
=
linspace
((
T
)
-
1
,
(
T
)
1
,
h
,
ctx
);
auto
h_idx_t
=
EigenTensor
<
T
,
1
>::
From
(
h_idx
);
// Get indexes of width with shape [height, width, 1]
auto
w_idx
=
linspace
((
T
)
-
1
,
(
T
)
1
,
w
,
ctx
);
auto
w_idx_t
=
EigenTensor
<
T
,
1
>::
From
(
w_idx
);
// Get constant ones tensor with shape [height, width, 1]
Tensor
ones
;
ones
.
mutable_data
<
T
>
({
h
,
w
,
1
},
ctx
.
GetPlace
());
auto
ones_t
=
EigenTensor
<
T
,
3
>::
From
(
ones
).
setConstant
((
T
)
1
);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor
grid
;
grid
.
mutable_data
<
T
>
({
n
,
h
,
w
,
3
},
ctx
.
GetPlace
());
auto
grid_t
=
EigenTensor
<
T
,
4
>::
From
(
grid
);
grid_t
.
device
(
place
)
=
w_idx_t
.
reshape
(
Array2
(
1
,
w
))
.
broadcast
(
Array2
(
h
,
1
))
.
reshape
(
Array3
(
h
,
w
,
1
))
.
concatenate
(
h_idx_t
.
reshape
(
Array2
(
1
,
h
))
.
broadcast
(
Array2
(
w
,
1
))
.
shuffle
(
Array2
(
1
,
0
))
.
reshape
(
Array3
(
h
,
w
,
1
)),
2
)
.
eval
()
.
concatenate
(
ones_t
,
2
)
.
reshape
(
Array4
(
1
,
h
,
w
,
3
))
.
broadcast
(
Array4
(
n
,
1
,
1
,
1
));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
Tensor
sliced_grid
=
grid
.
Slice
(
i
,
i
+
1
).
Resize
({
h
*
w
,
3
});
Tensor
sliced_theta
=
theta
->
Slice
(
i
,
i
+
1
).
Resize
({
2
,
3
});
Tensor
sliced_out
=
output
->
Slice
(
i
,
i
+
1
).
Resize
({
h
*
w
,
2
});
blas
.
MatMul
(
sliced_grid
,
false
,
sliced_theta
,
true
,
T
(
1
),
&
sliced_out
,
T
(
0
));
}
}
};
template
<
typename
DeviceContext
,
typename
T
>
class
AffineGridGradOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
&
place
=
*
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
();
auto
output_grad
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Output"
));
auto
theta_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Theta"
));
int
n
=
output_grad
->
dims
()[
0
];
auto
size_attr
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"output_shape"
);
int
h
=
0
;
int
w
=
0
;
if
(
size_attr
.
size
()
==
0
)
{
auto
*
output_shape
=
ctx
.
Input
<
Tensor
>
(
"OutputShape"
);
Tensor
h_sizes
;
framework
::
TensorCopy
(
*
output_shape
,
platform
::
CPUPlace
(),
&
h_sizes
);
const
int
*
h_size_data
=
h_sizes
.
data
<
int
>
();
h
=
h_size_data
[
2
];
w
=
h_size_data
[
3
];
}
else
{
h
=
size_attr
[
2
];
w
=
size_attr
[
3
];
}
theta_grad
->
mutable_data
<
T
>
({
n
,
2
,
3
},
ctx
.
GetPlace
());
math
::
SetConstant
<
DeviceContext
,
T
>
()(
ctx
.
template
device_context
<
DeviceContext
>(),
theta_grad
,
static_cast
<
T
>
(
0
));
Linspace
<
DeviceContext
,
T
>
linspace
;
// Get indexes of height with shape [height, width, 1]
auto
h_idx
=
linspace
((
T
)
-
1
,
(
T
)
1
,
h
,
ctx
);
auto
h_idx_t
=
EigenTensor
<
T
,
1
>::
From
(
h_idx
);
// Get indexes of width with shape [height, width, 1]
auto
w_idx
=
linspace
((
T
)
-
1
,
(
T
)
1
,
w
,
ctx
);
auto
w_idx_t
=
EigenTensor
<
T
,
1
>::
From
(
w_idx
);
// Get constant ones tensor with shape [height, width, 1]
Tensor
ones
;
ones
.
mutable_data
<
T
>
({
h
,
w
,
1
},
ctx
.
GetPlace
());
auto
ones_t
=
EigenTensor
<
T
,
3
>::
From
(
ones
).
setConstant
((
T
)
1
);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones
Tensor
grid
;
grid
.
mutable_data
<
T
>
({
n
,
h
,
w
,
3
},
ctx
.
GetPlace
());
auto
grid_t
=
EigenTensor
<
T
,
4
>::
From
(
grid
);
grid_t
.
device
(
place
)
=
w_idx_t
.
reshape
(
Array2
(
1
,
w
))
.
broadcast
(
Array2
(
h
,
1
))
.
reshape
(
Array3
(
h
,
w
,
1
))
.
concatenate
(
h_idx_t
.
reshape
(
Array2
(
1
,
h
))
.
broadcast
(
Array2
(
w
,
1
))
.
shuffle
(
Array2
(
1
,
0
))
.
reshape
(
Array3
(
h
,
w
,
1
)),
2
)
.
eval
()
.
concatenate
(
ones_t
,
2
)
.
reshape
(
Array4
(
1
,
h
,
w
,
3
))
.
broadcast
(
Array4
(
n
,
1
,
1
,
1
));
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
auto
blas
=
math
::
GetBlas
<
DeviceContext
,
T
>
(
ctx
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
Tensor
sliced_grid
=
grid
.
Slice
(
i
,
i
+
1
).
Resize
({
h
*
w
,
3
});
Tensor
sliced_out_grad
=
output_grad
->
Slice
(
i
,
i
+
1
).
Resize
({
h
*
w
,
2
});
Tensor
sliced_theta_grad
=
theta_grad
->
Slice
(
i
,
i
+
1
).
Resize
({
2
,
3
});
blas
.
MatMul
(
sliced_out_grad
,
true
,
sliced_grid
,
false
,
T
(
1
),
&
sliced_theta_grad
,
T
(
0
));
}
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/delete_var_op.cc
浏览文件 @
a412826c
...
...
@@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase {
}
};
// Shape inference for delete_var: intentionally a no-op, the call operator
// has an empty body.
class DeleteVarOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
  }
};
class
DeleteVarOpInfoMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
...
...
@@ -48,4 +53,5 @@ It should not be configured by users directly.
REGISTER_OPERATOR
(
delete_var
,
paddle
::
operators
::
DeleteVarOp
,
paddle
::
framework
::
EmptyGradOpMaker
,
paddle
::
operators
::
DeleteVarOpInfoMaker
);
paddle
::
operators
::
DeleteVarOpInfoMaker
,
paddle
::
operators
::
DeleteVarOpShapeInference
);
paddle/fluid/operators/gather_op.cc
浏览文件 @
a412826c
...
...
@@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
gather_grad
,
ops
::
GatherGradOp
);
REGISTER_OP_CPU_KERNEL
(
gather
,
ops
::
GatherOpKernel
<
float
>
,
ops
::
GatherOpKernel
<
int
>
,
ops
::
GatherOpKernel
<
double
>
);
ops
::
GatherOpKernel
<
double
>
,
ops
::
GatherOpKernel
<
int
>
,
ops
::
GatherOpKernel
<
int64_t
>
);
REGISTER_OP_CPU_KERNEL
(
gather_grad
,
ops
::
GatherGradientOpKernel
<
float
>
,
ops
::
GatherGradientOpKernel
<
double
>
,
ops
::
GatherGradientOpKernel
<
int
>
,
ops
::
GatherGradientOpKernel
<
double
>
);
ops
::
GatherGradientOpKernel
<
int64_t
>
);
paddle/fluid/operators/gather_op.cu
浏览文件 @
a412826c
...
...
@@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
gather
,
ops
::
GatherOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
gather_grad
,
ops
::
GatherGradOpCUDAKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
gather
,
ops
::
GatherOpCUDAKernel
<
float
>
,
ops
::
GatherOpCUDAKernel
<
double
>
,
ops
::
GatherOpCUDAKernel
<
int64_t
>
,
ops
::
GatherOpCUDAKernel
<
int
>
);
REGISTER_OP_CUDA_KERNEL
(
gather_grad
,
ops
::
GatherGradOpCUDAKernel
<
float
>
,
ops
::
GatherGradOpCUDAKernel
<
double
>
,
ops
::
GatherGradOpCUDAKernel
<
int64_t
>
,
ops
::
GatherGradOpCUDAKernel
<
int
>
);
paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
using framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using ScopedSpatialTransformerDescriptor =
    platform::ScopedSpatialTransformerDescriptor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;

/**
 * Forward grid sampler kernel backed by cuDNN.
 *
 * Samples Input(X) at the positions given by Input(Grid) through
 * cudnnSpatialTfSamplerForward. Output is allocated with the same
 * {n, c, h, w} sizes as X.
 */
template <typename T>
class CUDNNGridSampleOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace");
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    auto cudnn_handle = cuda_ctx.cudnn_handle();

    auto* x = ctx.Input<Tensor>("X");
    auto* grid = ctx.Input<Tensor>("Grid");
    auto* out = ctx.Output<Tensor>("Output");

    // The spatial transformer descriptor is built from X's own sizes.
    const auto x_dims = x->dims();
    int n = x_dims[0];
    int c = x_dims[1];
    int h = x_dims[2];
    int w = x_dims[3];
    const int sizes[4] = {n, c, h, w};

    const T* x_ptr = x->data<T>();
    const T* grid_ptr = grid->data<T>();
    T* out_ptr = out->mutable_data<T>({n, c, h, w}, ctx.GetPlace());

    ScopedSpatialTransformerDescriptor scoped_st_desc;
    cudnnSpatialTransformerDescriptor_t st_desc =
        scoped_st_desc.descriptor<T>(4, sizes);

    ScopedTensorDescriptor scoped_x_desc;
    ScopedTensorDescriptor scoped_out_desc;
    cudnnTensorDescriptor_t x_desc = scoped_x_desc.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize2int(x->dims()));
    cudnnTensorDescriptor_t out_desc = scoped_out_desc.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize2int(out->dims()));

    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward(
        cudnn_handle, st_desc, CudnnDataType<T>::kOne(), x_desc, x_ptr,
        grid_ptr, CudnnDataType<T>::kZero(), out_desc, out_ptr));
  }
};
// cuDNN backward kernel of the grid_sampler operator.
//
// Consumes "X", "Grid" and the gradient of "Output"; produces the gradients
// of "X" and "Grid" through cudnnSpatialTfSamplerBackward.
template <typename T>
class CUDNNGridSampleGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace");
    auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    auto cudnn_handle = cuda_ctx.cudnn_handle();

    auto* x = ctx.Input<Tensor>("X");
    auto* flow_grid = ctx.Input<Tensor>("Grid");
    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* flow_grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));

    // All extents are taken from the output gradient.
    auto out_grad_dims = out_grad->dims();
    const int n = out_grad_dims[0];
    const int c = out_grad_dims[1];
    const int h = out_grad_dims[2];
    const int w = out_grad_dims[3];
    const int size[4] = {n, c, h, w};

    ScopedSpatialTransformerDescriptor sampler_scope;
    cudnnSpatialTransformerDescriptor_t sampler_desc =
        sampler_scope.descriptor<T>(4, size);

    const T* x_ptr = x->data<T>();
    const T* grid_ptr = flow_grid->data<T>();
    const T* out_grad_ptr = out_grad->data<T>();
    // dX is allocated with the output-gradient shape, dGrid as [n, h, w, 2].
    T* x_grad_ptr = x_grad->mutable_data<T>(out_grad_dims, ctx.GetPlace());
    T* grid_grad_ptr =
        flow_grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());

    ScopedTensorDescriptor x_scope;
    ScopedTensorDescriptor x_grad_scope;
    ScopedTensorDescriptor out_grad_scope;
    cudnnTensorDescriptor_t x_desc = x_scope.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize2int(x->dims()));
    cudnnTensorDescriptor_t x_grad_desc = x_grad_scope.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize2int(x_grad->dims()));
    cudnnTensorDescriptor_t out_grad_desc = out_grad_scope.descriptor<T>(
        DataLayout::kNCHW, framework::vectorize2int(out_grad->dims()));

    // Scaling pairs are (alpha = 1, beta = 0) for both dX and dGrid, so both
    // gradient buffers are overwritten.
    CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward(
        cudnn_handle, sampler_desc, CudnnDataType<T>::kOne(), x_desc, x_ptr,
        CudnnDataType<T>::kZero(), x_grad_desc, x_grad_ptr,
        CudnnDataType<T>::kOne(), out_grad_desc, out_grad_ptr, grid_ptr,
        CudnnDataType<T>::kZero(), grid_grad_ptr));
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias for paddle::platform used by the registrations below.
namespace plat = paddle::platform;

// Register the cuDNN forward kernel of "grid_sampler" on CUDAPlace for
// float and double.
REGISTER_OP_KERNEL(grid_sampler, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNGridSampleOpKernel<float>,
                   paddle::operators::CUDNNGridSampleOpKernel<double>);

// Register the cuDNN backward kernel of "grid_sampler_grad" on CUDAPlace for
// float and double.
REGISTER_OP_KERNEL(grid_sampler_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNGridSampleGradOpKernel<float>,
                   paddle::operators::CUDNNGridSampleGradOpKernel<double>);
paddle/fluid/operators/grid_sampler_op.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h"
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
// Forward operator definition of grid_sampler: validates the shapes of
// "X" and "Grid" and selects the kernel (plain or cuDNN).
class GridSampleOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that X is [N, C, H, W] and Grid is [N, H, W, 2], then gives
  // "Output" the shape (and LoD) of X.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of GridSampleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grid"),
                   "Input(Grid) of GridSampleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Output"),
                   "Output(Output) of GridSampleOp should not be null.");

    auto in_dims = ctx->GetInputDim("X");
    auto flow_dims = ctx->GetInputDim("Grid");

    // Rank checks.
    PADDLE_ENFORCE(in_dims.size() == 4,
                   "Input(X) of GridSampleOp should be 4-D Tensor.");
    PADDLE_ENFORCE(flow_dims.size() == 4,
                   "Input(Grid) of GridSampleOp should be 4-D Tensor.");
    // The last grid axis holds the (x, y) pair.
    PADDLE_ENFORCE(flow_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
    // Batch and spatial extents of Grid must match those of X.
    PADDLE_ENFORCE_EQ(flow_dims[0], in_dims[0],
                      "Input(X) and Input(Grid) dims[0] should be equal.");
    PADDLE_ENFORCE_EQ(
        flow_dims[1], in_dims[2],
        "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
    PADDLE_ENFORCE_EQ(
        flow_dims[2], in_dims[3],
        "Input(X) dims[3] and Input(Grid) dims[2] should be equal.");

    ctx->SetOutputDim("Output", in_dims);
    ctx->ShareLoD("X", "Output");
  }

 protected:
  // Uses the cuDNN library when built with CUDA and CanCUDNNBeUsed(ctx)
  // holds; otherwise falls back to the plain kernel. The data type follows
  // input "X".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto kernel_library = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      kernel_library = framework::LibraryType::kCUDNN;
    }
#endif
    const auto x_data_type =
        framework::ToDataType(ctx.Input<Tensor>("X")->type());
    return framework::OpKernelType(x_data_type, ctx.GetPlace(),
                                   framework::DataLayout::kAnyLayout,
                                   kernel_library);
  }
};
// Declares the inputs, output, attribute and user-facing documentation of
// the grid_sampler operator.
class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor) The input data of GridSampleOp, "
             "This is a 4-D tensor with shape of [N, C, H, W]");
    AddInput(
        "Grid",
        "(Tensor) The input grid of GridSampleOp generated by AffineGridOp, "
        "This is a 4-D tensor with shape of [N, H, W, 2], which is the "
        "concatenation of x and y coordinates with shape [N, H, W] in last "
        "dimension");
    AddOutput("Output", "(Tensor) Output tensor with shape [N, C, H, W]");
    AddAttr<bool>(
        "use_cudnn",
        "(bool, default true) Only used in cudnn kernel, need install cudnn")
        .SetDefault(true);

    // The formulas below mirror the CPU kernel: corner coordinates are
    // floor(x)/floor(y) and their +1 neighbours, and each corner value is
    // weighted by the distances to the opposite sides.
    AddComment(R"DOC(
      This operation samples input X by using bilinear interpolation based on
      flow field grid, which is usually generated by affine_grid. The grid of
      shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
      with shape [N, H, W] each, where grid_x is indexing the 4th dimension
      (in width dimension) of input data x and grid_y is indexing the 3rd
      dimension (in height dimension), finally results is the bilinear
      interpolation value of 4 nearest corner points.

      Step 1:
        Get (x, y) grid coordinates and scale to [0, H-1/W-1].

        grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
        grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)

      Step 2:
        Indices input data X with grid (x, y) in each [H, W] area, and bilinear
        interpolate point value by 4 nearest points.

          wn ------- y_n ------- en
          |           |           |
          |          d_n          |
          |           |           |
         x_w --d_w-- grid--d_e-- x_e
          |           |           |
          |          d_s          |
          |           |           |
          ws ------- y_s ------- es

        x_w = floor(x)              // west side x coord
        x_e = x_w + 1               // east side x coord
        y_n = floor(y)              // north side y coord
        y_s = y_n + 1               // south side y coord

        d_w = grid_x - x_w          // distance to west side
        d_e = x_e - grid_x          // distance to east side
        d_n = grid_y - y_n          // distance to north side
        d_s = y_s - grid_y          // distance to south side

        wn = X[:, :, y_n, x_w]      // north-west point value
        en = X[:, :, y_n, x_e]      // north-east point value
        ws = X[:, :, y_s, x_w]      // south-west point value
        es = X[:, :, y_s, x_e]      // south-east point value

        output = wn * d_e * d_s + en * d_w * d_s
               + ws * d_e * d_n + es * d_w * d_n
         )DOC");
  }
};
// Backward operator definition of grid_sampler: the gradients of "X" and
// "Grid" take the shapes of the corresponding forward inputs.
class GridSampleOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Sets the shape of each gradient output that is actually present.
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto x_dims = ctx->GetInputDim("X");
    auto flow_dims = ctx->GetInputDim("Grid");

    const auto x_grad_name = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad_name)) {
      ctx->SetOutputDim(x_grad_name, x_dims);
    }

    const auto grid_grad_name = framework::GradVarName("Grid");
    if (ctx->HasOutput(grid_grad_name)) {
      ctx->SetOutputDim(grid_grad_name, flow_dims);
    }
  }

 protected:
  // Same selection rule as the forward op: cuDNN when built with CUDA and
  // CanCUDNNBeUsed(ctx) holds, plain otherwise; data type follows "X".
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto kernel_library = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_CUDA
    if (platform::CanCUDNNBeUsed(ctx)) {
      kernel_library = framework::LibraryType::kCUDNN;
    }
#endif
    const auto x_data_type =
        framework::ToDataType(ctx.Input<Tensor>("X")->type());
    return framework::OpKernelType(x_data_type, ctx.GetPlace(),
                                   framework::DataLayout::kAnyLayout,
                                   kernel_library);
  }
};
// Builds the description of the "grid_sampler_grad" op from the forward op.
class GridSampleGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  // Returns a grad OpDesc whose inputs are X, Grid and d(Output), whose
  // outputs are d(X) and d(Grid), and which carries the forward attributes.
  std::unique_ptr<framework::OpDesc> Apply() const override {
    // Own the descriptor from the start so it is released if any of the
    // setters below throws.
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("grid_sampler_grad");
    op->SetInput("X", Input("X"));
    op->SetInput("Grid", Input("Grid"));
    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));

    op->SetAttrMap(Attrs());

    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetOutput(framework::GradVarName("Grid"), InputGrad("Grid"));
    return op;
  }
};
}
// namespace operators
}
// namespace paddle
// Short alias for paddle::operators used by the registrations below.
namespace ops = paddle::operators;

// Forward operator, with its proto maker and its grad-op maker.
REGISTER_OPERATOR(grid_sampler, ops::GridSampleOp, ops::GridSampleOpMaker,
                  ops::GridSampleGradMaker);
// Backward operator.
REGISTER_OPERATOR(grid_sampler_grad, ops::GridSampleOpGrad);

// CPU kernels of the forward op for float and double.
REGISTER_OP_CPU_KERNEL(
    grid_sampler,
    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GridSampleOpKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels of the backward op for float and double.
REGISTER_OP_CPU_KERNEL(
    grid_sampler_grad,
    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GridSampleGradOpKernel<paddle::platform::CPUDeviceContext, double>);
paddle/fluid/operators/grid_sampler_op.h
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace
paddle
{
namespace
operators
{
// Local shorthand for the framework tensor type.
using Tensor = framework::Tensor;

// Shorthand for framework::EigenTensor; defaults to row-major layout and
// Eigen::DenseIndex indices.
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;

// Fixed-rank extent lists (3 and 4 entries of int64_t); Array4 is passed to
// Eigen reshape/broadcast calls in the kernels of this header.
using Array3 = Eigen::DSizes<int64_t, 3>;
using Array4 = Eigen::DSizes<int64_t, 4>;
// Returns true unless x lies outside [0, x_max] or y lies outside
// [0, y_max]. Written as a negated disjunction so that unordered
// comparisons behave exactly like the branchy form.
template <typename T>
static inline bool isInBound(T x, T y, T x_max, T y_max) {
  const bool out_of_range = x < 0 || x > x_max || y < 0 || y > y_max;
  return !out_of_range;
}
// Computes, for every grid point, the coordinates of its four neighbouring
// pixels and the distances to the four sides.
//
//   grid : [n, h, w, 2] tensor of interleaved (x, y) pairs.
//   x_w, x_e, y_n, y_s : outputs, [n, h, w]; floor(x), floor(x) + 1,
//                        floor(y), floor(y) + 1 of the scaled coordinates.
//   d_w, d_e, d_n, d_s : outputs, [n, h, w]; distances from the scaled
//                        coordinate to the west/east/north/south side.
//
// All output tensors are (re)allocated here on ctx.GetPlace().
template <typename T>
static void CalcGridLocations(const platform::CPUDeviceContext& ctx,
                              const Tensor& grid, Tensor* x_w, Tensor* x_e,
                              Tensor* y_n, Tensor* y_s, Tensor* d_w,
                              Tensor* d_e, Tensor* d_n, Tensor* d_s) {
  auto& place = *ctx.eigen_device();

  const int n = grid.dims()[0];
  const int h = grid.dims()[1];
  const int w = grid.dims()[2];
  const T x_max = static_cast<T>(w - 1);
  const T y_max = static_cast<T>(h - 1);

  // De-interleave the (x, y) pairs of the last axis into two [n, h, w]
  // tensors.
  Tensor grid_x, grid_y;
  T* xs = grid_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
  T* ys = grid_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
  const T* pairs = grid.data<T>();
  for (int idx = 0; idx < n * h * w; ++idx) {
    xs[idx] = pairs[2 * idx];
    ys[idx] = pairs[(2 * idx) + 1];
  }

  // Constant tensor of ones, reused for the "+ 1" steps below.
  Tensor ones;
  ones.mutable_data<T>({n, h, w}, ctx.GetPlace());
  auto ones_t = EigenTensor<T, 3>::From(ones).setConstant(1.0);

  // Scale the coordinates: 0.5 * (v + 1) * max.
  auto grid_x_t = EigenTensor<T, 3>::From(grid_x);
  auto grid_y_t = EigenTensor<T, 3>::From(grid_y);
  grid_x_t.device(place) = 0.5 * ((grid_x_t + ones_t) * x_max);
  grid_y_t.device(place) = 0.5 * ((grid_y_t + ones_t) * y_max);

  // Coordinates of the four corner points.
  x_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
  x_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
  y_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
  y_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
  auto x_w_t = EigenTensor<T, 3>::From(*x_w);
  auto x_e_t = EigenTensor<T, 3>::From(*x_e);
  auto y_n_t = EigenTensor<T, 3>::From(*y_n);
  auto y_s_t = EigenTensor<T, 3>::From(*y_s);
  // The east/south values read the west/north results, so the order of
  // these four assignments matters.
  x_w_t.device(place) = grid_x_t.floor();
  x_e_t.device(place) = x_w_t + ones_t;
  y_n_t.device(place) = grid_y_t.floor();
  y_s_t.device(place) = y_n_t + ones_t;

  // Distances to the four sides.
  d_w->mutable_data<T>({n, h, w}, ctx.GetPlace());
  d_e->mutable_data<T>({n, h, w}, ctx.GetPlace());
  d_n->mutable_data<T>({n, h, w}, ctx.GetPlace());
  d_s->mutable_data<T>({n, h, w}, ctx.GetPlace());
  auto d_w_t = EigenTensor<T, 3>::From(*d_w);
  auto d_e_t = EigenTensor<T, 3>::From(*d_e);
  auto d_n_t = EigenTensor<T, 3>::From(*d_n);
  auto d_s_t = EigenTensor<T, 3>::From(*d_s);
  d_w_t.device(place) = grid_x_t - x_w_t;
  d_e_t.device(place) = x_e_t - grid_x_t;
  d_n_t.device(place) = grid_y_t - y_n_t;
  d_s_t.device(place) = y_s_t - grid_y_t;
}
// Gathers, for every output location (i, k, l), the input value at the
// pixel coordinate (y(i, k, l), x(i, k, l)) for all channels.
//
//   input  : [n, c, h, w] source tensor.
//   output : [n, c, h, w] destination, already allocated by the caller; it is
//            zero-filled first, so out-of-bound locations stay 0.
//   x, y   : [n, h, w] coordinate tensors.
template <typename T>
static void GetGridPointValue(const Tensor& input, Tensor* output,
                              const Tensor& x, const Tensor& y) {
  const int n = input.dims()[0];
  const int c = input.dims()[1];
  const int h = input.dims()[2];
  const int w = input.dims()[3];

  auto x_t = EigenTensor<T, 3>::From(x);
  auto y_t = EigenTensor<T, 3>::From(y);
  auto output_t = EigenTensor<T, 4>::From(*output).setConstant((T)0);
  auto input_t = EigenTensor<T, 4>::From(input);

  for (int batch = 0; batch < n; ++batch) {
    for (int row = 0; row < h; ++row) {
      for (int col = 0; col < w; ++col) {
        const T px = x_t(batch, row, col);
        const T py = y_t(batch, row, col);
        // Skip coordinates that fall outside the image.
        if (!isInBound(px, py, (T)(w - 1), (T)(h - 1))) {
          continue;
        }
        const int src_row = static_cast<int>(round(py));
        const int src_col = static_cast<int>(round(px));
        for (int ch = 0; ch < c; ++ch) {
          output_t(batch, ch, row, col) =
              input_t(batch, ch, src_row, src_col);
        }
      }
    }
  }
}
// Accumulates the output gradient into the input gradient at one corner.
//
// For every location (i, k, l) whose corner coordinate (x, y) is in bounds,
// adds output_grad(i, j, k, l) * d1(i, k, l) * d2(i, k, l) to
// input_grad(i, j, round(y), round(x)) for each channel j.
//
//   output_grad : [n, c, h, w] gradient of the op output.
//   input_grad  : [n, c, h, w] accumulator; NOT cleared here.
//   x, y        : [n, h, w] corner coordinates.
//   d1, d2      : [n, h, w] interpolation weights of that corner.
template <typename T>
static void GatherOutputGradToInputGrad(const Tensor& output_grad,
                                        Tensor* input_grad, const Tensor& x,
                                        const Tensor& y, const Tensor& d1,
                                        const Tensor& d2) {
  const int n = output_grad.dims()[0];
  const int c = output_grad.dims()[1];
  const int h = output_grad.dims()[2];
  const int w = output_grad.dims()[3];

  auto x_t = EigenTensor<T, 3>::From(x);
  auto y_t = EigenTensor<T, 3>::From(y);
  auto d1_t = EigenTensor<T, 3>::From(d1);
  auto d2_t = EigenTensor<T, 3>::From(d2);
  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);

  for (int batch = 0; batch < n; ++batch) {
    for (int row = 0; row < h; ++row) {
      for (int col = 0; col < w; ++col) {
        const T px = x_t(batch, row, col);
        const T py = y_t(batch, row, col);
        // Corners outside the image receive no gradient.
        if (!isInBound(px, py, (T)(w - 1), (T)(h - 1))) {
          continue;
        }
        const int dst_row = static_cast<int>(round(py));
        const int dst_col = static_cast<int>(round(px));
        for (int ch = 0; ch < c; ++ch) {
          input_grad_t(batch, ch, dst_row, dst_col) +=
              output_grad_t(batch, ch, row, col) * d1_t(batch, row, col) *
              d2_t(batch, row, col);
        }
      }
    }
  }
}
// Forward kernel of grid_sampler built on Eigen.
//
// Computes the four corner coordinates and side distances of every grid
// point, gathers the four corner values of X, and blends them bilinearly
// into "Output".
template <typename DeviceContext, typename T>
class GridSampleOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto* x = ctx.Input<Tensor>("X");
    auto* flow_grid = ctx.Input<Tensor>("Grid");

    const int n = x->dims()[0];
    const int c = x->dims()[1];
    const int h = x->dims()[2];
    const int w = x->dims()[3];

    // Corner coordinates and distances to the four sides.
    Tensor x_w, x_e, y_n, y_s;
    Tensor d_w, d_e, d_n, d_s;
    CalcGridLocations<T>(
        ctx.template device_context<platform::CPUDeviceContext>(), *flow_grid,
        &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);

    // Allocate and zero the output.
    auto* out = ctx.Output<Tensor>("Output");
    out->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), out,
        static_cast<T>(0));

    // Values of X at the four corners of every grid point.
    Tensor v_wn, v_en, v_ws, v_es;
    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    GetGridPointValue<T>(*x, &v_wn, x_w, y_n);
    GetGridPointValue<T>(*x, &v_en, x_e, y_n);
    GetGridPointValue<T>(*x, &v_ws, x_w, y_s);
    GetGridPointValue<T>(*x, &v_es, x_e, y_s);

    auto d_w_t = EigenTensor<T, 3>::From(d_w);
    auto d_e_t = EigenTensor<T, 3>::From(d_e);
    auto d_n_t = EigenTensor<T, 3>::From(d_n);
    auto d_s_t = EigenTensor<T, 3>::From(d_s);

    // Expand the [n, h, w] distances to [n, c, h, w] by inserting a channel
    // axis of extent 1 and repeating it c times.
    auto d_w_scaled_t =
        d_w_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
    auto d_e_scaled_t =
        d_e_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
    auto d_n_scaled_t =
        d_n_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));
    auto d_s_scaled_t =
        d_s_t.reshape(Array4(n, 1, h, w)).broadcast(Array4(1, c, 1, 1));

    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
    auto v_en_t = EigenTensor<T, 4>::From(v_en);
    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
    auto v_es_t = EigenTensor<T, 4>::From(v_es);
    auto out_t = EigenTensor<T, 4>::From(*out);

    // Bilinear interpolation: each corner value is weighted by the
    // distances to the opposite sides.
    out_t.device(place) = v_wn_t * d_e_scaled_t * d_s_scaled_t +
                          v_en_t * d_w_scaled_t * d_s_scaled_t +
                          v_ws_t * d_e_scaled_t * d_n_scaled_t +
                          v_es_t * d_w_scaled_t * d_n_scaled_t;
  }
};
// Backward kernel of grid_sampler built on Eigen.
//
// Produces d(X) by scattering d(Output) to the four corner pixels of each
// grid point, and d(Grid) from the differences of the corner values.
template <typename DeviceContext, typename T>
class GridSampleGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<Tensor>("X");
    auto* flow_grid = ctx.Input<Tensor>("Grid");
    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));

    const int n = x->dims()[0];
    const int c = x->dims()[1];
    const int h = x->dims()[2];
    const int w = x->dims()[3];

    // d(X): allocated as [n, c, h, w] and cleared, because the gather calls
    // below accumulate into it.
    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    x_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), x_grad,
        static_cast<T>(0));

    // d(Grid): allocated as [n, h, w, 2] and cleared.
    auto* flow_grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
    flow_grid_grad->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
    math::SetConstant<DeviceContext, T>()(
        ctx.template device_context<DeviceContext>(), flow_grid_grad,
        static_cast<T>(0));

    // Corner coordinates and distances to the four sides.
    Tensor x_w, x_e, y_n, y_s;
    Tensor d_w, d_e, d_n, d_s;
    CalcGridLocations<T>(
        ctx.template device_context<platform::CPUDeviceContext>(), *flow_grid,
        &x_w, &x_e, &y_n, &y_s, &d_w, &d_e, &d_n, &d_s);

    // Scatter d(Output) into d(X), one corner at a time, using the same
    // weights as the forward interpolation.
    GatherOutputGradToInputGrad<T>(*out_grad, x_grad, x_w, y_n, d_e, d_s);
    GatherOutputGradToInputGrad<T>(*out_grad, x_grad, x_w, y_s, d_e, d_n);
    GatherOutputGradToInputGrad<T>(*out_grad, x_grad, x_e, y_n, d_w, d_s);
    GatherOutputGradToInputGrad<T>(*out_grad, x_grad, x_e, y_s, d_w, d_n);

    // Values of X at the four corners of every grid point.
    Tensor v_wn, v_en, v_ws, v_es;
    v_wn.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_en.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_ws.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    v_es.mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    GetGridPointValue<T>(*x, &v_wn, x_w, y_n);
    GetGridPointValue<T>(*x, &v_en, x_e, y_n);
    GetGridPointValue<T>(*x, &v_ws, x_w, y_s);
    GetGridPointValue<T>(*x, &v_es, x_e, y_s);

    auto v_wn_t = EigenTensor<T, 4>::From(v_wn);
    auto v_en_t = EigenTensor<T, 4>::From(v_en);
    auto v_ws_t = EigenTensor<T, 4>::From(v_ws);
    auto v_es_t = EigenTensor<T, 4>::From(v_es);

    auto d_w_t = EigenTensor<T, 3>::From(d_w);
    auto d_e_t = EigenTensor<T, 3>::From(d_e);
    auto d_n_t = EigenTensor<T, 3>::From(d_n);
    auto d_s_t = EigenTensor<T, 3>::From(d_s);

    auto out_grad_t = EigenTensor<T, 4>::From(*out_grad);

    // Per-axis gradient accumulators, [n, h, w] each, summed over channels.
    Tensor grid_grad_x, grid_grad_y;
    grid_grad_x.mutable_data<T>({n, h, w}, ctx.GetPlace());
    grid_grad_y.mutable_data<T>({n, h, w}, ctx.GetPlace());
    auto grid_grad_x_t = EigenTensor<T, 3>::From(grid_grad_x).setConstant(0.0);
    auto grid_grad_y_t = EigenTensor<T, 3>::From(grid_grad_y).setConstant(0.0);

    for (int batch = 0; batch < n; ++batch) {
      for (int ch = 0; ch < c; ++ch) {
        for (int row = 0; row < h; ++row) {
          for (int col = 0; col < w; ++col) {
            // x direction: east minus west on both rows.
            grid_grad_x_t(batch, row, col) +=
                ((v_en_t(batch, ch, row, col) - v_wn_t(batch, ch, row, col)) *
                     d_s_t(batch, row, col) +
                 (v_es_t(batch, ch, row, col) - v_ws_t(batch, ch, row, col)) *
                     d_n_t(batch, row, col)) *
                out_grad_t(batch, ch, row, col);
            // y direction: south minus north on both columns.
            grid_grad_y_t(batch, row, col) +=
                ((v_ws_t(batch, ch, row, col) - v_wn_t(batch, ch, row, col)) *
                     d_e_t(batch, row, col) +
                 (v_es_t(batch, ch, row, col) - v_en_t(batch, ch, row, col)) *
                     d_w_t(batch, row, col)) *
                out_grad_t(batch, ch, row, col);
          }
        }
      }
    }

    // Undo the coordinate scaling applied in the forward pass
    // (factor 0.5 * max along each axis).
    const T x_max = static_cast<T>(w - 1);
    const T y_max = static_cast<T>(h - 1);
    grid_grad_x_t = grid_grad_x_t * (x_max / (T)2);
    grid_grad_y_t = grid_grad_y_t * (y_max / (T)2);

    // Interleave the x and y gradients back into the [n, h, w, 2] layout.
    T* merged = flow_grid_grad->data<T>();
    T* gx = grid_grad_x.data<T>();
    T* gy = grid_grad_y.data<T>();
    for (int idx = 0; idx < n * h * w; ++idx) {
      merged[2 * idx] = gx[idx];
      merged[2 * idx + 1] = gy[idx];
    }
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -76,6 +76,6 @@ endif()
cc_test
(
concat_test SRCS concat_test.cc DEPS concat_and_split
)
cc_test
(
cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info
)
cc_library
(
jit_kernel
SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas
)
SRCS jit_kernel.cc jit_
gen.cc jit_code.cc jit_
kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc
DEPS cpu_info cblas
gflags enforce
)
cc_test
(
jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel
)
paddle/fluid/operators/math/jit_code.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
gen
{
using
namespace
platform
::
jit
;
// NOLINT
// Reports whether the JIT vector-multiply code can be used for length d:
// AVX (or AVX2) must be available and d must be a multiple of
// AVX_FLOAT_BLOCK.
bool VMulJitCode::init(int d) {
  // TODO(TJ): maybe one AVX is enough, AVX above would slow down freq
  // try more with avx2 or avx512
  const bool has_avx = MayIUse(avx) || MayIUse(avx2);
  if (!has_avx) {
    return false;
  }
  return d % AVX_FLOAT_BLOCK == 0;
}
// Emits a fully unrolled element-wise multiply: for each block of
// AVX_FLOAT_BLOCK floats, load from param1 and param2, multiply, and store
// to param3; then return.
void VMulJitCode::generate() {
  // do not need push stack, and do not need save avx512reg if do not use avx512
  int stride = sizeof(float) * AVX_FLOAT_BLOCK;  // bytes per block
  for (int blk = 0; blk < num_ / AVX_FLOAT_BLOCK; ++blk) {
    const int byte_offset = blk * stride;
    vmovups(ymm_src1, ptr[param1 + byte_offset]);
    vmovups(ymm_src2, ptr[param2 + byte_offset]);
    vmulps(ymm_dst, ymm_src1, ymm_src2);
    vmovups(ptr[param3 + byte_offset], ymm_dst);
  }
  ret();
}
}
// namespace gen
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_code.h
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/jit_gen.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
gen
{
// Const-qualified shorthands for the Xbyak register operand types.
using reg64_t = const Xbyak::Reg64;
using reg32_t = const Xbyak::Reg32;
using xmm_t = const Xbyak::Xmm;
using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm;
// Shorthand for Xbyak's label type.
using Label = Xbyak::Label;
// JIT code generator for an element-wise float multiply of length d.
class VMulJitCode : public JitCode {
 public:
  DECLARE_JIT_CODE(VMulJitCode);

  // d is the vector length; code_size and code_ptr are forwarded to JitCode.
  explicit VMulJitCode(int d, size_t code_size = 256 * 1024,
                       void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), num_(d) {}

  // Whether JIT code can be generated for length d on this machine.
  static bool init(int d);
  // Emits the machine code.
  void generate() override;

 private:
  int num_;  // vector length

  // Registers carrying the three call arguments.
  reg64_t param1{abi_param1};
  reg64_t param2{abi_param2};
  reg64_t param3{abi_param3};

  // Vector register index 0: first source operand.
  xmm_t xmm_src1{0};
  ymm_t ymm_src1{0};
  zmm_t zmm_src1{0};

  // Vector register index 1: second source operand.
  xmm_t xmm_src2{1};
  ymm_t ymm_src2{1};
  zmm_t zmm_src2{1};

  // Vector register index 2: destination.
  xmm_t xmm_dst{2};
  ymm_t ymm_dst{2};
  zmm_t zmm_dst{2};
};
}
// namespace gen
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_gen.cc
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/jit_gen.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool
(
dump_jitcode
,
false
,
"Whether to dump the jitcode to file"
);
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
gen
{
// Registers that JitCode::preCode() pushes and JitCode::postCode() pops
// (in reverse order) around the generated code.
constexpr Xbyak::Operand::Code g_abi_regs[] = {
    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};

// Number of entries in g_abi_regs.
constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
// Emits the prologue: pushes every register in g_abi_regs and, when AVX-512F
// is available, loads 2 * EVEX_max_8b_offt into reg_EVEX_max_8b_offt for use
// by EVEX_compress_addr().
void JitCode::preCode() {
  for (const auto reg_code : g_abi_regs) {
    push(Xbyak::Reg64(reg_code));
  }
  const bool has_avx512f = platform::jit::MayIUse(platform::jit::avx512f);
  if (has_avx512f) {
    mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
  }
}
// Emits the epilogue: pops the registers of g_abi_regs in the reverse of the
// order preCode() pushed them, then returns.
void JitCode::postCode() {
  for (int idx = num_g_abi_regs - 1; idx >= 0; --idx) {
    pop(Xbyak::Reg64(g_abi_regs[idx]));
  }
  ret();
}
// Writes the generated machine code to "paddle_jitcode_<name>.<k>.bin" in
// the current directory, where <k> is a counter incremented on every call
// with a non-null pointer.
//
//   code : start of the generated code; nothing happens when it is null.
//
// The dump is best-effort: if the file cannot be opened, the call is a
// no-op apart from advancing the counter.
void JitCode::dumpCode(const Xbyak::uint8* code) const {
  if (!code) {
    return;
  }
  static int counter = 0;
  std::ostringstream filename;
  filename << "paddle_jitcode_" << name() << "." << counter << ".bin";
  counter++;
  // The payload is raw machine code, so open in binary mode; a text-mode
  // stream may translate bytes on platforms that distinguish the two.
  std::ofstream fout(filename.str(), std::ios::out | std::ios::binary);
  if (fout.is_open()) {
    fout.write(reinterpret_cast<const char*>(code), getSize());
    fout.close();
  }
}
// Builds the address [base + offt], rewriting large offsets as
// [base + reg_EVEX_max_8b_offt * scale + (offt - k * EVEX_max_8b_offt)]:
//   offt in [1, 3) * EVEX_max_8b_offt -> scale 1, subtract 2 * max
//   offt in [3, 5) * EVEX_max_8b_offt -> scale 2, subtract 4 * max
// Other offsets are used unchanged. Relies on reg_EVEX_max_8b_offt holding
// 2 * EVEX_max_8b_offt, which preCode() loads when AVX-512F is available.
//
//   bcast : select the zword_b address frame instead of zword.
Xbyak::Address JitCode::EVEX_compress_addr(Xbyak::Reg64 base, int offt,
                                           bool bcast) {
  int scale = 0;
  const bool in_first_window =
      EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt;
  const bool in_second_window =
      3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt;
  if (in_first_window) {
    offt -= 2 * EVEX_max_8b_offt;
    scale = 1;
  } else if (in_second_window) {
    offt -= 4 * EVEX_max_8b_offt;
    scale = 2;
  }

  auto addr_expr = Xbyak::RegExp() + base + offt;
  if (scale) {
    addr_expr = addr_expr + reg_EVEX_max_8b_offt * scale;
  }

  if (bcast) {
    return zword_b[addr_expr];
  }
  return zword[addr_expr];
}
}
// namespace gen
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_gen.h
0 → 100644
浏览文件 @
a412826c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <gflags/gflags.h>
#include <type_traits>
#include "paddle/fluid/platform/macros.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
DECLARE_bool
(
dump_jitcode
);
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
gen
{
#define DECLARE_JIT_CODE(codename) \
const char *name() const override { return #codename; }
// Application Binary Interface
// Application Binary Interface
// Register codes used for the first four call arguments (RDI, RSI, RDX,
// RCX); abi_not_param1 is also RCX.
// NOTE(review): this order matches the System V x86-64 calling convention —
// confirm no other ABI needs to be supported.
constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
    abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
    abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);
class
JitCode
:
public
Xbyak
::
CodeGenerator
{
public:
explicit
JitCode
(
size_t
code_size
=
256
*
1024
,
void
*
code_ptr
=
nullptr
)
:
Xbyak
::
CodeGenerator
(
code_size
,
code_ptr
)
{}
virtual
~
JitCode
()
{}
virtual
const
char
*
name
()
const
=
0
;
virtual
void
generate
()
=
0
;
template
<
typename
FUNC
>
const
FUNC
getCode
()
{
this
->
generate
();
const
Xbyak
::
uint8
*
code
=
CodeGenerator
::
getCode
();
if
(
FLAGS_dump_jitcode
)
{
this
->
dumpCode
(
code
);
}
return
reinterpret_cast
<
const
FUNC
>
(
code
);
}
DISABLE_COPY_AND_ASSIGN
(
JitCode
);
protected:
Xbyak
::
Reg64
param1
{
abi_param1
};
const
int
EVEX_max_8b_offt
=
0x200
;
const
Xbyak
::
Reg64
reg_EVEX_max_8b_offt
=
rbp
;
void
preCode
();
void
postCode
();
void
dumpCode
(
const
Xbyak
::
uint8
*
code
)
const
;
void
L
(
const
char
*
label
)
{
Xbyak
::
CodeGenerator
::
L
(
label
);
}
void
L
(
const
Xbyak
::
Label
&
label
)
{
Xbyak
::
CodeGenerator
::
L
(
label
);
}
// Enhanced vector extension
Xbyak
::
Address
EVEX_compress_addr
(
Xbyak
::
Reg64
base
,
int
offt
,
bool
bcast
=
false
);
};
}
// namespace gen
}
// namespace jitkernel
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
a412826c
...
...
@@ -39,6 +39,7 @@ class Kernel {
public:
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
// TODO(TJ): below members should be deprecated.
int
num_
{
0
};
int
end_
{
0
};
int
rest_
{
0
};
...
...
@@ -64,7 +65,7 @@ class KernelPool {
template
<
typename
T
>
class
VMulKernel
:
public
Kernel
{
public:
v
irtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
v
oid
(
*
Compute
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
;
};
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
a412826c
...
...
@@ -14,7 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/math/jit_kernel.h"
#include <string>
#include "paddle/fluid/operators/math/jit_code.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
...
...
@@ -27,65 +30,76 @@ namespace paddle {
namespace
operators
{
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
// Reference (portable) element-wise product: z[k] = x[k] * y[k] for every
// k in [0, n). Elements are processed in increasing index order; nothing is
// written when n <= 0.
template <typename T>
void VMulRefer(const T* x, const T* y, T* z, int n) {
  int idx = 0;
  while (idx < n) {
    z[idx] = x[idx] * y[idx];
    ++idx;
  }
}
#ifdef PADDLE_WITH_MKLML
// MKL-backed element-wise multiply. Only the float and double
// specializations below are defined; each forwards its arguments to the
// matching MKL routine loaded through platform::dynload.
template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n);

// float: forwards to vsMul(n, x, y, z).
template <>
void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
  platform::dynload::vsMul(n, x, y, z);
}

// double: forwards to vdMul(n, x, y, z).
template <>
void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
  platform::dynload::vdMul(n, x, y, z);
}
#endif
/* VMUL JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
template
<
typename
T
>
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
public:
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
static
inline
std
::
string
name
(
int
d
)
{
PADDLE_THROW
(
"DType should be either float or double"
);
}
};
static
inline
bool
useJIT
(
int
d
)
{
return
false
;
}
static
inline
bool
useMKL
(
int
d
)
{
return
false
;
}
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
if
(
useJIT
(
d
))
{
constexpr
size_t
sz
=
256
*
1024
;
// TODO(TJ): should be related with d
jitcode_
.
reset
(
new
gen
::
VMulJitCode
(
d
,
sz
));
this
->
Compute
=
jitcode_
->
getCode
<
void
(
*
)(
const
T
*
,
const
T
*
,
T
*
,
int
)
>
();
return
;
}
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \
template <> \
void VMulKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsMul(this->num_, x, y, z); \
if
(
useMKL
(
d
))
{
this
->
Compute
=
VMulMKL
<
T
>
;
return
;
}
#endif
this
->
Compute
=
VMulRefer
<
T
>
;
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VMulKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdMul(this->num_, x, y, z); \
}
private:
std
::
unique_ptr
<
gen
::
VMulJitCode
>
jitcode_
{
nullptr
};
};
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
// float specialization: whether a JIT kernel is used for size d is decided
// entirely by gen::VMulJitCode::init(d).
template <>
bool VMulKernelImpl<float>::useJIT(int d) {
  const bool jit_usable = gen::VMulJitCode::init(d);
  return jit_usable;
}
#define INTRI8_FLOAT(isa) \
template <> \
void VMulKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_mul_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
// float specialization: MKL is selected only when jit::MayIUse(jit::avx512f)
// holds and the vector length d is greater than 512.
template <>
bool VMulKernelImpl<float>::useMKL(int d) {
  const bool has_avx512f = jit::MayIUse(jit::avx512f);
  return has_avx512f && d > 512;
}
// avx > for > mkl
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
#endif
#ifdef __AVX2__
INTRI8_FLOAT
(
jit
::
avx2
);
#endif
#ifdef __AVX512F__
INTRI8_FLOAT
(
jit
::
avx512f
);
#endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT
#undef MKL_FLOAT
#undef MKL_DOUBLE
// double specialization: always reports true, whatever the size d is.
template <>
bool VMulKernelImpl<double>::useMKL(int d) {
  return true;
}
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
/* VADD JitKernel */
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
...
...
@@ -465,13 +479,12 @@ INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL
(
vmul
,
VMulKernel
);
REGISTER_JITKERNEL
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL
(
videntity
,
VIdentityKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vadd
,
VAddKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vscal
,
VScalKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vaddb
,
VAddBiasKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vrelu
,
VReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
vaddrelu
,
VAddReluKernel
);
REGISTER_JITKERNEL_DEPRECATED
(
videntity
,
VIdentityKernel
);
}
// namespace jitkernel
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
浏览文件 @
a412826c
...
...
@@ -288,7 +288,7 @@ INTRIAVX512_FLOAT(kGT16);
#undef INIT_ALPHA
#undef UPDATE_ALPHA
REGISTER_JITKERNEL
(
crf_decode
,
CRFDecodeKernel
);
REGISTER_JITKERNEL
_DEPRECATED
(
crf_decode
,
CRFDecodeKernel
);
}
// namespace jitkernel
}
// namespace math
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
a412826c
...
...
@@ -250,7 +250,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
#undef MKL_FLOAT
#undef MKL_DOUBLE
REGISTER_JITKERNEL
(
vexp
,
VExpKernel
);
REGISTER_JITKERNEL
_DEPRECATED
(
vexp
,
VExpKernel
);
/* VSigmoid JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
...
...
@@ -396,7 +396,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
REGISTER_JITKERNEL
_DEPRECATED
(
vsigmoid
,
VSigmoidKernel
);
/* VTanh JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
...
...
@@ -531,7 +531,7 @@ INTRI16_FLOAT(jit::avx512f, detail::ExpAVX2);
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
REGISTER_JITKERNEL
_DEPRECATED
(
vtanh
,
VTanhKernel
);
#undef JITKERNEL_NEW_ACT_IMPL
...
...
paddle/fluid/operators/math/jit_kernel_macro.h
浏览文件 @
a412826c
...
...
@@ -21,8 +21,71 @@ namespace operators {
namespace
math
{
namespace
jitkernel
{
namespace
jit
=
platform
::
jit
;
// Defines the float and double specializations of <ker_class>Impl::name(d),
// which build the string key of a kernel:
//   float : #ker_key "f" followed by "jit" + d, "mkl" or "any"
//   double: #ker_key "d" followed by "mkl" or "any"
// The choice is driven by the class's own useJIT(d) / useMKL(d).
#define JITKERNEL_DEFINE_NAME(ker_key, ker_class) \
template <> \
std::string ker_class##Impl<float>::name(int d) { \
std::string key(#ker_key "f"); \
if (useJIT(d)) { \
/* only the JIT code needs to record d */
\
return key + "jit" + std::to_string(d); \
} else if (useMKL(d)) { \
return key + "mkl"; \
} else { \
return key + "any"; \
} \
} \
template <> \
std::string ker_class##Impl<double>::name(int d) { \
std::string key(#ker_key "d"); \
/* JIT code does not support double yet */
\
if (useMKL(d)) { \
return key + "mkl"; \
} else { \
return key + "any"; \
} \
}
// Expands to the signature of the explicit specialization
// KernelPool::Get<ker_class<ker_dtype>, int>(int d); the function body is
// supplied by the macro that uses it.
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
// Declares the local `key` from <ker_class>Impl<ker_dtype>::name(d); expects
// `d` to be in scope.
#define JITKERNEL_FIND_KEY(ker_class, ker_dtype) \
std::string key = ker_class##Impl<ker_dtype>::name(d)
// Assigns to the local `p` a freshly created <ker_class>Impl<ker_dtype>(d),
// cast to the ker_class<ker_dtype> interface; expects `p` and `d` in scope.
#define JITKERNEL_IMPL(ker_class, ker_dtype) \
p = std::dynamic_pointer_cast<ker_class<ker_dtype>>( \
std::make_shared<ker_class##Impl<ker_dtype>>(d))
// Generates the body of one kernel getter for a single element type.
//   marco_declare  - macro producing the function signature
//   macro_find_key - macro that must declare the local `key`
//   macro_impl     - macro that must assign the new kernel to the local `p`
// If `key` is not yet present in kers_, a kernel is created, inserted and
// returned; otherwise the stored kernel is returned after a cast to
// const ker_class<ker_dtype>.
#define REGISTER_JITKERNEL_WITH_DTYPE(ker_class, ker_dtype, marco_declare, \
macro_find_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
macro_find_key(ker_class, ker_dtype); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
macro_impl(ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
// Registers a kernel for both supported element types (float and double).
//   ker_key           - key prefix handed to macro_define_name
//   ker_class         - kernel interface class (its implementation is
//                       <ker_class>Impl)
//   macro_define_name - macro that defines the name() specializations
//   macro_declare     - macro producing the getter signature
//   macro_find_key    - macro declaring the local `key`
//   macro_impl        - macro assigning the new kernel to the local `p`
// The last three macros are forwarded to REGISTER_JITKERNEL_WITH_DTYPE, so a
// caller that passes custom macros actually gets them used (previously the
// JITKERNEL_* defaults were hard-coded and the arguments silently ignored).
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, macro_define_name,     \
                                macro_declare, macro_find_key, macro_impl) \
  macro_define_name(ker_key, ker_class);                                   \
  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, macro_declare,           \
                                macro_find_key, macro_impl);               \
  REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, macro_declare,          \
                                macro_find_key, macro_impl)
// Default registration entry point: invokes REGISTER_JITKERNEL_ARGS with the
// standard JITKERNEL_DEFINE_NAME / JITKERNEL_DECLARE / JITKERNEL_FIND_KEY /
// JITKERNEL_IMPL macros.
#define REGISTER_JITKERNEL(ker_key, ker_class) \
REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \
JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \
JITKERNEL_IMPL)
namespace
jit
=
platform
::
jit
;
// TODO(TJ): the defines below are deprecated and will be removed soon
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < AVX_FLOAT_BLOCK) { \
macro_(ker, dtype, isa, kLT8); \
...
...
@@ -47,44 +110,42 @@ namespace jit = platform::jit;
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \
}
#define JITKERNEL_DECLARE(ker_class, ker_dtype) \
template <> \
std::shared_ptr<const ker_class<ker_dtype>> \
KernelPool::Get<ker_class<ker_dtype>, int>(int d)
#define JITKERNEL_KEY(ker_key, dtype_key) \
#ker_key #dtype_key + std::to_string(d)
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
#define JITKERNEL_NEW_IMPL
_DEPRECATED
(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>(
\
std::make_shared<ker##Impl<dtype, isa, k>>(d))
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
#define JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, ker_dtype, \
dtype_key, marco_declare, macro_key, \
macro_impl) \
marco_declare(ker_class, ker_dtype) { \
std::string key = macro_key(ker_key, dtype_key); \
if (kers_.find(key) == kers_.end()) { \
std::shared_ptr<ker_class<ker_dtype>> p; \
SEARCH_ISA_BLOCK(macro_impl, ker_class, ker_dtype); \
kers_.insert({key, std::dynamic_pointer_cast<Kernel>(p)}); \
return p; \
} \
return std::dynamic_pointer_cast<const ker_class<ker_dtype>>( \
kers_.at(key)); \
}
#define REGISTER_JITKERNEL(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, JITKERNEL_DECLARE, \
JITKERNEL_KEY, JITKERNEL_NEW_IMPL)
#define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_declare, macro_key, \
macro_impl) \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, float, f, marco_declare, macro_key, \
macro_impl); \
JITKERNEL_WITH_DTYPE(ker_key, ker_class, double, d, marco_declare, \
macro_key, macro_impl)
#define REGISTER_JITKERNEL_DEPRECATED(ker_key, ker_class) \
JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, \
JITKERNEL_DECLARE, JITKERNEL_KEY, \
JITKERNEL_NEW_IMPL_DEPRECATED); \
JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \
JITKERNEL_DECLARE, JITKERNEL_KEY, \
JITKERNEL_NEW_IMPL_DEPRECATED)
#define REGISTER_JITKERNEL_ARGS_DEPRECATED(ker_key, ker_class, marco_declare, \
macro_key, macro_impl) \
JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, float, f, marco_declare, \
macro_key, macro_impl); \
JITKERNEL_WITH_DTYPE_DEPRECATED(ker_key, ker_class, double, d, \
marco_declare, macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \
...
...
paddle/fluid/operators/math/jit_kernel_rnn.cc
浏览文件 @
a412826c
...
...
@@ -179,23 +179,23 @@ class LSTMKernelImpl : public LSTMKernel<T> {
/* C_t = C_t-1 * fgated + cand_gated * igated */
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
private:
...
...
@@ -289,36 +289,36 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
void
ComputeCtHt
(
T
*
gates
,
const
T
*
ct_1
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
,
T
*
checked
)
const
override
{
/* get fgated and igated*/
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
);
vmul_d_
->
Compute
(
wp_data
,
ct_1
,
checked
,
d_
);
vmul_d_
->
Compute
(
wp_data
+
d_
,
ct_1
,
checked
+
d_
,
d_
);
vadd_d2_
->
Compute
(
checked
,
gates
+
d_
,
gates
+
d_
);
act_gate_d2_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
/* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
gates
+
d_
,
d_
);
vmul_d_
->
Compute
(
ct_1
,
gates
+
d2_
,
gates
+
d2_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d2_
,
ct
);
/* get ogated*/
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
void
ComputeC1H1
(
T
*
gates
,
T
*
ct
,
T
*
ht
,
const
T
*
wp_data
)
const
override
{
/* C_t = igated * cgated*/
act_gate_d_
->
Compute
(
gates
+
d_
,
gates
+
d_
);
act_cand_d_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
);
vmul_d_
->
Compute
(
gates
,
gates
+
d_
,
ct
,
d_
);
/* get outgated, put W_oc * C_t on igated */
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
);
vmul_d_
->
Compute
(
wp_data
+
d2_
,
ct
,
gates
+
d_
,
d_
);
vadd_d_
->
Compute
(
gates
+
d_
,
gates
+
d3_
,
gates
+
d3_
);
/* H_t = act_cell(C_t) * ogated */
act_gate_d_
->
Compute
(
gates
+
d3_
,
gates
+
d3_
);
act_cell_d_
->
Compute
(
ct
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
);
vmul_d_
->
Compute
(
gates
+
d2_
,
gates
+
d3_
,
ht
,
d_
);
}
private:
...
...
@@ -352,8 +352,8 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
act_cell, d)); \
}
REGISTER_JITKERNEL_ARGS
(
lstm
,
LSTMKernel
,
JITKERNEL_DECLARE_LSTM
,
JITKERNEL_KEY_LSTM
,
JITKERNEL_NEW_LSTM_IMPL
);
REGISTER_JITKERNEL_ARGS
_DEPRECATED
(
lstm
,
LSTMKernel
,
JITKERNEL_DECLARE_LSTM
,
JITKERNEL_KEY_LSTM
,
JITKERNEL_NEW_LSTM_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_DECLARE_LSTM
...
...
@@ -378,13 +378,13 @@ class GRUKernelImpl : public GRUKernel<T> {
void
ComputeH1
(
T
*
gates
,
T
*
ht
)
const
override
{
act_gate_d_
->
Compute
(
gates
,
gates
);
act_state_d_
->
Compute
(
gates
+
d2_
,
gates
+
d2_
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
);
vmul_d_
->
Compute
(
gates
,
gates
+
d2_
,
ht
,
d_
);
}
void
ComputeHtPart1
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
// W: {W_update, W_reset; W_state}
act_gate_d2_
->
Compute
(
gates
,
gates
);
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
);
vmul_d_
->
Compute
(
ht_1
,
gates
+
d_
,
ht
,
d_
);
}
void
ComputeHtPart2
(
T
*
gates
,
const
T
*
ht_1
,
T
*
ht
)
const
override
{
...
...
@@ -472,8 +472,8 @@ INTRI8_FLOAT(jit::avx512f);
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(act_gate, act_state, d));
REGISTER_JITKERNEL_ARGS
(
gru
,
GRUKernel
,
JITKERNEL_DECLARE_GRU
,
JITKERNEL_KEY_GRU
,
JITKERNEL_NEW_GRU_IMPL
);
REGISTER_JITKERNEL_ARGS
_DEPRECATED
(
gru
,
GRUKernel
,
JITKERNEL_DECLARE_GRU
,
JITKERNEL_KEY_GRU
,
JITKERNEL_NEW_GRU_IMPL
);
#undef INTRI8_FLOAT
#undef JITKERNEL_NEW_GRU_IMPL
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
a412826c
...
...
@@ -369,12 +369,12 @@ void lstm_ctht_better(
int
d2
=
d
*
2
;
vsigmoid_3d
->
Compute
(
gates
+
d
,
gates
+
d
);
vtanh_d
->
Compute
(
gates
,
gates
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
,
gates
+
d
,
gates
+
d
,
d
);
vmul_d
->
Compute
(
ct_1
,
gates
+
d2
,
gates
+
d2
,
d
);
vadd_d
->
Compute
(
gates
+
d
,
gates
+
d2
,
ct
);
/* H_t = act_cell(C_t) * ogated */
vtanh_d
->
Compute
(
ct
,
gates
+
d2
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
);
vmul_d
->
Compute
(
gates
+
d2
,
gates
+
d
*
3
,
ht
,
d
);
}
TEST
(
JitKernel
,
lstm
)
{
...
...
@@ -578,7 +578,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) {
TEST
(
JitKernel
,
vmul
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
256
,
512
,
1000
,
1024
})
{
std
::
vector
<
float
>
x
(
d
),
y
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
());
...
...
@@ -616,7 +616,7 @@ TEST(JitKernel, vmul) {
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
,
d
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -800,8 +800,8 @@ TEST(JitKernel, pool) {
EXPECT_TRUE
(
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_f
)
!=
std
::
dynamic_pointer_cast
<
const
jit
::
Kernel
>
(
pvmul_d
));
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf
4
"
);
const
auto
&
pvmul_from_key
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf
any
"
);
EXPECT_EQ
(
pvmul_f
,
pvmul_from_key
);
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf
5
"
);
const
auto
&
pvmul_from_key2
=
jit
::
KernelPool
::
Instance
().
Get
(
"vmulf
jit
"
);
EXPECT_TRUE
(
pvmul_from_key2
==
nullptr
);
}
paddle/fluid/operators/math/pooling.cc
浏览文件 @
a412826c
...
...
@@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_height
=
input
.
dims
()[
2
];
const
int
input_width
=
input
.
dims
()[
3
];
...
...
@@ -68,7 +68,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
pool_process
.
compute
(
input_data
[
h
*
input_width
+
w
],
&
ele
);
}
}
int
pool_size
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
output_data
[
ph
*
output_width
+
pw
]
=
ele
;
}
...
...
@@ -93,7 +94,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_grad_process
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_height
=
input
.
dims
()[
2
];
const
int
input_width
=
input
.
dims
()[
3
];
...
...
@@ -124,7 +125,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
int
wstart
=
pw
*
stride_width
-
padding_width
;
int
wend
=
std
::
min
(
wstart
+
ksize_width
,
input_width
);
wstart
=
std
::
max
(
wstart
,
0
);
int
pool_size
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_height
*
ksize_width
;
float
scale
=
1.0
/
pool_size
;
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
for
(
int
w
=
wstart
;
w
<
wend
;
++
w
)
{
...
...
@@ -249,7 +251,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_depth
=
input
.
dims
()[
2
];
const
int
input_height
=
input
.
dims
()[
3
];
...
...
@@ -300,7 +302,9 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
}
}
int
pool_size
=
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
);
exclusive
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
output_data
[
output_idx
]
=
ele
;
}
...
...
@@ -326,7 +330,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
output
,
const
framework
::
Tensor
&
output_grad
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_grad_process
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_depth
=
input
.
dims
()[
2
];
const
int
input_height
=
input
.
dims
()[
3
];
...
...
@@ -369,7 +373,9 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
wstart
=
std
::
max
(
wstart
,
0
);
int
pool_size
=
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
);
exclusive
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
float
scale
=
1.0
/
pool_size
;
for
(
int
d
=
dstart
;
d
<
dend
;
++
d
)
{
for
(
int
h
=
hstart
;
h
<
hend
;
++
h
)
{
...
...
paddle/fluid/operators/math/pooling.cu
浏览文件 @
a412826c
...
...
@@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
const
int
ksize_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
T
*
output_data
)
{
bool
exclusive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -52,7 +52,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
pool_process
.
compute
(
input_data
[
h
*
input_width
+
w
],
&
ele
);
}
}
int
pool_size
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
output_data
[
index
]
=
ele
;
}
...
...
@@ -65,7 +66,7 @@ __global__ void KernelPool2DGrad(
const
int
input_width
,
const
int
output_height
,
const
int
output_width
,
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
T
*
input_grad
)
{
PoolProcess
pool_process
,
bool
exclusive
,
T
*
input_grad
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
offsetW
=
index
%
input_width
+
padding_width
;
...
...
@@ -95,7 +96,8 @@ __global__ void KernelPool2DGrad(
int
wend
=
min
(
wstart
+
ksize_width
,
input_width
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
int
pool_size
=
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_height
*
ksize_width
;
int
output_sub_idx
=
ph
*
output_width
+
pw
;
pool_process
.
compute
(
input
,
output_data
[
output_sub_idx
],
output_grad
[
output_sub_idx
],
...
...
@@ -163,7 +165,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -189,7 +191,8 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
KernelPool2D
<
PoolProcess
,
T
><<<
grid
,
threads
,
0
,
context
.
stream
()
>>>
(
nthreads
,
input_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
output_data
);
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
output_data
);
}
};
...
...
@@ -208,7 +211,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -236,7 +239,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
output_data
,
output_grad_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
input_grad_data
);
pool_process
,
exclusive
,
input_grad_data
);
}
};
...
...
@@ -313,16 +316,14 @@ template class Pool2dGradFunctor<platform::CUDADeviceContext,
double
>
;
template
<
typename
PoolProcess
,
typename
T
>
__global__
void
KernelPool3D
(
const
int
nthreads
,
const
T
*
input_data
,
const
int
channels
,
const
int
input_depth
,
const
int
input_height
,
const
int
input_width
,
const
int
output_depth
,
const
int
output_height
,
const
int
output_width
,
const
int
ksize_depth
,
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
T
*
output_data
)
{
__global__
void
KernelPool3D
(
const
int
nthreads
,
const
T
*
input_data
,
const
int
channels
,
const
int
input_depth
,
const
int
input_height
,
const
int
input_width
,
const
int
output_depth
,
const
int
output_height
,
const
int
output_width
,
const
int
ksize_depth
,
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -351,7 +352,9 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
}
}
}
int
pool_size
=
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
output_data
[
index
]
=
ele
;
}
...
...
@@ -366,7 +369,7 @@ __global__ void KernelPool3DGrad(
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
T
*
input_grad
)
{
bool
exclusive
,
T
*
input_grad
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
offsetW
=
index
%
input_width
+
padding_width
;
...
...
@@ -409,7 +412,9 @@ __global__ void KernelPool3DGrad(
dstart
=
max
(
dstart
,
0
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
int
pool_size
=
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
);
int
pool_size
=
exclusive
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
int
output_sub_idx
=
(
pd
*
output_height
+
ph
)
*
output_width
+
pw
;
pool_process
.
compute
(
input
,
output_data
[
output_sub_idx
],
output_grad
[
output_sub_idx
],
...
...
@@ -484,7 +489,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_depth
=
input
.
dims
()[
2
];
...
...
@@ -517,7 +522,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
input_channels
,
input_depth
,
input_height
,
input_width
,
output_depth
,
output_height
,
output_width
,
ksize_depth
,
ksize_height
,
ksize_width
,
stride_depth
,
stride_height
,
stride_width
,
padding_depth
,
padding_height
,
padding_width
,
pool_process
,
padding_depth
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
output_data
);
}
};
...
...
@@ -537,7 +542,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_depth
=
input
.
dims
()[
2
];
...
...
@@ -573,7 +578,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
input_depth
,
input_height
,
input_width
,
output_depth
,
output_height
,
output_width
,
ksize_depth
,
ksize_height
,
ksize_width
,
stride_depth
,
stride_height
,
stride_width
,
padding_depth
,
padding_height
,
padding_width
,
pool_process
,
input_grad_data
);
padding_width
,
pool_process
,
exclusive
,
input_grad_data
);
}
};
...
...
paddle/fluid/operators/math/pooling.h
浏览文件 @
a412826c
...
...
@@ -89,7 +89,7 @@ class Pool2dFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
framework
::
Tensor
*
output
);
bool
exclusive
,
framework
::
Tensor
*
output
);
};
template
<
typename
DeviceContext
,
typename
PoolProcess
,
typename
T
>
...
...
@@ -101,7 +101,7 @@ class Pool2dGradFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
framework
::
Tensor
*
input_grad
);
bool
exclusive
,
framework
::
Tensor
*
input_grad
);
};
template
<
typename
DeviceContext
,
class
T
>
...
...
@@ -123,7 +123,7 @@ class Pool3dFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
framework
::
Tensor
*
output
);
bool
exclusive
,
framework
::
Tensor
*
output
);
};
template
<
typename
DeviceContext
,
typename
PoolProcess
,
typename
T
>
...
...
@@ -135,7 +135,7 @@ class Pool3dGradFunctor {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_compute
,
framework
::
Tensor
*
input_grad
);
bool
exclusive
,
framework
::
Tensor
*
input_grad
);
};
template
<
typename
DeviceContext
,
class
T
>
...
...
paddle/fluid/operators/math/sequence_pooling.cc
浏览文件 @
a412826c
...
...
@@ -31,7 +31,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
template
<
typename
T
,
bool
is_test
>
class
MaxSeqPoolFunctor
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
...
...
@@ -70,7 +70,41 @@ class MaxSeqPoolFunctor {
}
}
};
// Instantisation of Max Sequence Pooling for test phase eg. no need to fill
// index buffer
template
<
typename
T
>
class
MaxSeqPoolFunctor
<
T
,
true
>
{
public:
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
index
)
{
auto
in_dims
=
input
.
dims
();
auto
out_dims
=
output
->
dims
();
PADDLE_ENFORCE_GT
(
in_dims
.
size
(),
1
);
PADDLE_ENFORCE_GT
(
out_dims
.
size
(),
1
);
for
(
int64_t
i
=
1
;
i
<
in_dims
.
size
();
++
i
)
{
PADDLE_ENFORCE_EQ
(
in_dims
[
i
],
out_dims
[
i
]);
}
auto
starts
=
input
.
lod
()[
0
];
const
T
*
in_data
=
input
.
data
<
T
>
();
T
*
out_data
=
output
->
data
<
T
>
();
int64_t
num_seq
=
out_dims
[
0
];
int64_t
dim
=
output
->
numel
()
/
num_seq
;
for
(
int64_t
i
=
0
;
i
<
num_seq
;
++
i
)
{
std
::
memcpy
(
&
out_data
[
i
*
dim
],
&
in_data
[
starts
[
i
]
*
dim
],
dim
*
sizeof
(
T
));
for
(
size_t
j
=
starts
[
i
]
+
1
;
j
<
starts
[
i
+
1
];
++
j
)
{
for
(
int64_t
k
=
0
;
k
<
dim
;
++
k
)
{
if
(
in_data
[
j
*
dim
+
k
]
>
out_data
[
i
*
dim
+
k
])
{
out_data
[
i
*
dim
+
k
]
=
in_data
[
j
*
dim
+
k
];
}
}
}
}
}
};
template
<
typename
T
>
class
MaxSeqPoolGradFunctor
{
public:
...
...
@@ -188,11 +222,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
/* max pool has index output */
void
operator
()(
const
platform
::
CPUDeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output
,
bool
is_test
,
framework
::
Tensor
*
index
=
nullptr
)
{
if
(
pooltype
==
"MAX"
)
{
math
::
MaxSeqPoolFunctor
<
T
>
max_pool
;
max_pool
(
context
,
input
,
output
,
index
);
if
(
is_test
)
{
math
::
MaxSeqPoolFunctor
<
T
,
true
>
max_pool
;
max_pool
(
context
,
input
,
output
,
index
);
}
else
{
math
::
MaxSeqPoolFunctor
<
T
,
false
>
max_pool
;
max_pool
(
context
,
input
,
output
,
index
);
}
return
;
}
if
(
pooltype
==
"LAST"
)
{
...
...
@@ -200,6 +239,7 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
last_pool
(
context
,
input
,
output
);
return
;
}
if
(
pooltype
==
"FIRST"
)
{
math
::
FirstSeqPoolFunctor
<
T
>
first_pool
;
first_pool
(
context
,
input
,
output
);
...
...
paddle/fluid/operators/math/sequence_pooling.cu
浏览文件 @
a412826c
...
...
@@ -133,7 +133,7 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
public:
void
operator
()(
const
platform
::
CUDADeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
output
,
bool
is_test
,
framework
::
Tensor
*
index
=
nullptr
)
{
auto
&
lod
=
input
.
lod
()[
0
];
const
size_t
item_dim
=
output
->
numel
()
/
output
->
dims
()[
0
];
...
...
paddle/fluid/operators/math/sequence_pooling.h
浏览文件 @
a412826c
...
...
@@ -28,7 +28,7 @@ class SequencePoolFunctor {
/* max pool has index output */
void
operator
()(
const
DeviceContext
&
context
,
const
std
::
string
pooltype
,
const
framework
::
LoDTensor
&
input
,
framework
::
Tensor
*
output
,
framework
::
Tensor
*
index
=
nullptr
);
bool
is_test
=
false
,
framework
::
Tensor
*
index
=
nullptr
);
};
template
<
typename
DeviceContext
,
typename
T
>
...
...
paddle/fluid/operators/pool_cudnn_op.cu.cc
浏览文件 @
a412826c
...
...
@@ -41,6 +41,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
T
*
output_data
=
output
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
std
::
string
pooling_type
=
ctx
.
Attr
<
std
::
string
>
(
"pooling_type"
);
bool
exclusive
=
ctx
.
Attr
<
bool
>
(
"exclusive"
);
std
::
vector
<
int
>
ksize
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"ksize"
);
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
...
...
@@ -72,7 +73,8 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
if
(
pooling_type
==
"max"
)
{
pooling_mode
=
PoolingMode
::
kMaximum
;
}
else
{
pooling_mode
=
PoolingMode
::
kAverage
;
pooling_mode
=
exclusive
?
PoolingMode
::
kAverageExclusive
:
PoolingMode
::
kAverageInclusive
;
}
cudnnPoolingDescriptor_t
cudnn_pool_desc
=
...
...
@@ -101,6 +103,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
Tensor
*
input_grad
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
std
::
string
pooling_type
=
ctx
.
Attr
<
std
::
string
>
(
"pooling_type"
);
bool
exclusive
=
ctx
.
Attr
<
bool
>
(
"exclusive"
);
std
::
vector
<
int
>
ksize
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"ksize"
);
std
::
vector
<
int
>
strides
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
ctx
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
...
...
@@ -141,7 +144,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
pooling_mode
=
PoolingMode
::
kMaximum
;
}
}
else
{
pooling_mode
=
PoolingMode
::
kAverage
;
pooling_mode
=
exclusive
?
PoolingMode
::
kAverageExclusive
:
PoolingMode
::
kAverageInclusive
;
}
cudnnPoolingDescriptor_t
cudnn_pool_desc
=
...
...
paddle/fluid/operators/pool_op.cc
浏览文件 @
a412826c
...
...
@@ -180,6 +180,12 @@ void Pool2dOpMaker::Make() {
"operator."
"If global_pooling = true, paddings and ksize will be ignored."
)
.
SetDefault
({
0
,
0
});
AddAttr
<
bool
>
(
"exclusive"
,
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"use_cudnn"
,
"(bool, default false) Only used in cudnn kernel, need install cudnn"
)
...
...
@@ -236,6 +242,23 @@ Example:
W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
For exclusive = true:
$$
hstart = i * strides[0] - paddings[0]
hend = hstart + ksize[0]
wstart = j * strides[1] - paddings[1]
wend = wstart + ksize[1]
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{ksize[0] * ksize[1]}
$$
For exclusive = false:
$$
hstart = max(0, i * strides[0] - paddings[0])
hend = min(H, hstart + ksize[0])
wstart = max(0, j * strides[1] - paddings[1])
wend = min(W, wstart + ksize[1])
Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
$$
)DOC"
);
}
...
...
@@ -283,6 +306,12 @@ void Pool3dOpMaker::Make() {
"If global_pooling = true, ksize and paddings will be ignored."
)
.
SetDefault
({
0
,
0
,
0
});
// TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.)
AddAttr
<
bool
>
(
"exclusive"
,
"(bool, default True) When true, will exclude the zero-padding in the "
"averaging calculating, otherwise, include the zero-padding. Note, it "
"is only used when pooling_type is avg. The defalut is True."
)
.
SetDefault
(
true
);
AddAttr
<
bool
>
(
"use_cudnn"
,
...
...
paddle/fluid/operators/pool_op.h
浏览文件 @
a412826c
...
...
@@ -69,6 +69,7 @@ class PoolKernel : public framework::OpKernel<T> {
std
::
vector
<
int
>
ksize
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"ksize"
);
std
::
vector
<
int
>
strides
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
bool
exclusive
=
context
.
Attr
<
bool
>
(
"exclusive"
);
if
(
context
.
Attr
<
bool
>
(
"global_pooling"
))
{
for
(
size_t
i
=
0
;
i
<
ksize
.
size
();
++
i
)
{
paddings
[
i
]
=
0
;
...
...
@@ -84,7 +85,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward
;
paddle
::
operators
::
math
::
MaxPool
<
T
>
pool_process
;
pool2d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
true
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
paddle
::
operators
::
math
::
Pool2dFunctor
<
...
...
@@ -92,7 +93,7 @@ class PoolKernel : public framework::OpKernel<T> {
pool2d_forward
;
paddle
::
operators
::
math
::
AvgPool
<
T
>
pool_process
;
pool2d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
exclusive
,
out
);
}
}
break
;
case
3
:
{
...
...
@@ -102,14 +103,14 @@ class PoolKernel : public framework::OpKernel<T> {
pool3d_forward
;
paddle
::
operators
::
math
::
MaxPool
<
T
>
pool_process
;
pool3d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
true
,
out
);
}
else
if
(
pooling_type
==
"avg"
)
{
paddle
::
operators
::
math
::
Pool3dFunctor
<
DeviceContext
,
paddle
::
operators
::
math
::
AvgPool
<
T
>
,
T
>
pool3d_forward
;
paddle
::
operators
::
math
::
AvgPool
<
T
>
pool_process
;
pool3d_forward
(
dev_ctx
,
*
in_x
,
ksize
,
strides
,
paddings
,
pool_process
,
out
);
exclusive
,
out
);
}
}
break
;
default:
{
PADDLE_THROW
(
"Pool op only supports 2D and 3D input."
);
}
...
...
@@ -131,6 +132,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
std
::
vector
<
int
>
ksize
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"ksize"
);
std
::
vector
<
int
>
strides
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"strides"
);
std
::
vector
<
int
>
paddings
=
context
.
Attr
<
std
::
vector
<
int
>>
(
"paddings"
);
bool
exclusive
=
context
.
Attr
<
bool
>
(
"exclusive"
);
if
(
context
.
Attr
<
bool
>
(
"global_pooling"
))
{
for
(
size_t
i
=
0
;
i
<
ksize
.
size
();
++
i
)
{
...
...
@@ -157,7 +159,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool2d_backward
;
paddle
::
operators
::
math
::
AvgPoolGrad
<
T
>
pool_process
;
pool2d_backward
(
dev_ctx
,
*
in_x
,
*
out
,
*
out_grad
,
ksize
,
strides
,
paddings
,
pool_process
,
in_x_grad
);
paddings
,
pool_process
,
exclusive
,
in_x_grad
);
}
}
break
;
case
3
:
{
...
...
@@ -172,7 +174,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
pool3d_backward
;
paddle
::
operators
::
math
::
AvgPoolGrad
<
T
>
pool_process
;
pool3d_backward
(
dev_ctx
,
*
in_x
,
*
out
,
*
out_grad
,
ksize
,
strides
,
paddings
,
pool_process
,
in_x_grad
);
paddings
,
pool_process
,
exclusive
,
in_x_grad
);
}
}
break
;
default:
{
PADDLE_THROW
(
"Pool op only supports 2D and 3D input."
);
}
...
...
paddle/fluid/operators/sequence_pool_op.cc
浏览文件 @
a412826c
...
...
@@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor<int>) This tensor is used for the sequence max-pooling "
"to record the max indexes."
)
.
AsIntermediate
();
AddAttr
<
bool
>
(
"is_test"
,
""
).
SetDefault
(
false
);
AddAttr
<
std
::
string
>
(
"pooltype"
,
"(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp."
)
...
...
paddle/fluid/operators/sequence_pool_op.h
浏览文件 @
a412826c
...
...
@@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel<T> {
auto
*
in
=
context
.
Input
<
LoDTensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
std
::
string
pooltype
=
context
.
Attr
<
std
::
string
>
(
"pooltype"
);
Tensor
*
index
=
nullptr
;
if
(
pooltype
==
"MAX"
)
{
index
=
context
.
Output
<
Tensor
>
(
"MaxIndex"
);
}
auto
dims
=
in
->
dims
();
auto
lod
=
in
->
lod
();
...
...
@@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel<T> {
dims
[
0
]
=
lod
[
0
].
size
()
-
1
;
out
->
Resize
({
dims
});
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
if
(
pooltype
==
"MAX"
)
{
Tensor
*
index
=
nullptr
;
const
bool
is_test
=
context
.
Attr
<
bool
>
(
"is_test"
);
// Do not create index buffer for inference (is_test) mode
// TODO(jczaja): Skip index buffer creation for other devices eg. GPU
if
(
pooltype
==
"MAX"
&&
(
is_test
==
false
||
platform
::
is_cpu_place
(
context
.
GetPlace
())
==
false
))
{
index
=
context
.
Output
<
Tensor
>
(
"MaxIndex"
);
index
->
Resize
({
dims
});
index
->
mutable_data
<
int
>
(
context
.
GetPlace
());
}
math
::
SequencePoolFunctor
<
DeviceContext
,
T
>
pool
;
pool
(
context
.
template
device_context
<
DeviceContext
>(),
pooltype
,
*
in
,
out
,
index
);
i
s_test
,
i
ndex
);
}
};
...
...
paddle/fluid/operators/sign_op.cc
浏览文件 @
a412826c
...
...
@@ -67,4 +67,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
sign
,
ops
::
SignOp
,
ops
::
SignOpMaker
<
float
>
,
ops
::
SignGradMaker
);
REGISTER_OP_CPU_KERNEL
(
sign
,
ops
::
SignKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
sign
,
ops
::
SignKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
,
ops
::
SignKernel
<
paddle
::
platform
::
CPUDeviceContext
,
double
>
);
paddle/fluid/operators/sign_op.cu
浏览文件 @
a412826c
...
...
@@ -13,7 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sign_op.h"
#include "paddle/fluid/platform/float16.h"
REGISTER_OP_CUDA_KERNEL
(
sign
,
paddle
::
operators
::
SignKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
paddle
::
operators
::
SignKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
,
paddle
::
operators
::
SignKernel
<
paddle
::
platform
::
CUDADeviceContext
,
double
>
,
paddle
::
operators
::
SignKernel
<
paddle
::
platform
::
CUDADeviceContext
,
paddle
::
platform
::
float16
>
);
paddle/fluid/operators/softmax_with_cross_entropy_op.cc
浏览文件 @
a412826c
...
...
@@ -44,6 +44,12 @@ class SoftmaxWithCrossEntropyOpMaker
"(bool, default: false), A flag to indicate whether to interpretate "
"the given labels as soft labels."
)
.
SetDefault
(
false
);
AddAttr
<
bool
>
(
"numeric_stable_mode"
,
"(bool, default: false), A flag to indicate whether to use more "
"numerically stable algorithm. This flag is only valid when "
"soft_label is false and GPU is used."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"ignore_index"
,
"(int, default -100), Specifies a target value that is ignored and"
...
...
paddle/fluid/operators/softmax_with_cross_entropy_op.cu
浏览文件 @
a412826c
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"
#include "paddle/fluid/platform/for_range.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -117,8 +118,8 @@ using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
// Make sure that BlockDim <= feature_size
// This kernel is used to calculate the max element of each row
template
<
typename
T
,
int
BlockDim
>
__global__
void
RowReductionForMax
(
const
T
*
logits_data
,
T
*
max_data
,
int
feature_size
)
{
static
__global__
void
RowReductionForMax
(
const
T
*
logits_data
,
T
*
max_data
,
int
feature_size
)
{
__shared__
BlockReduceTempStorage
<
T
,
BlockDim
>
temp_storage
;
auto
beg_idx
=
feature_size
*
blockIdx
.
x
+
threadIdx
.
x
;
...
...
@@ -141,9 +142,10 @@ __global__ void RowReductionForMax(const T* logits_data, T* max_data,
}
// Make sure that BlockDim <= feature_size
template
<
typename
T
,
int
BlockDim
>
__global__
void
RowReductionForDiffMaxSum
(
const
T
*
logits_data
,
T
*
max_data
,
T
*
softmax
,
int
feature_size
)
{
template
<
typename
T
,
int
BlockDim
,
bool
CalculateLogSoftmax
=
false
>
static
__global__
void
RowReductionForDiffMaxSum
(
const
T
*
logits_data
,
T
*
max_data
,
T
*
softmax
,
int
feature_size
)
{
__shared__
BlockReduceTempStorage
<
T
,
BlockDim
>
temp_storage
;
auto
beg_idx
=
feature_size
*
blockIdx
.
x
+
threadIdx
.
x
;
...
...
@@ -153,24 +155,34 @@ __global__ void RowReductionForDiffMaxSum(const T* logits_data, T* max_data,
softmax
[
beg_idx
]
=
logits_data
[
beg_idx
]
-
block_max
;
T
diff_max_sum
=
real_exp
(
softmax
[
beg_idx
]);
beg_idx
+=
BlockDim
;
while
(
beg_
idx
<
end_idx
)
{
softmax
[
beg_idx
]
=
logits_data
[
beg_
idx
]
-
block_max
;
diff_max_sum
+=
real_exp
(
softmax
[
beg_
idx
]);
beg_
idx
+=
BlockDim
;
auto
idx
=
beg_idx
+
BlockDim
;
while
(
idx
<
end_idx
)
{
softmax
[
idx
]
=
logits_data
[
idx
]
-
block_max
;
diff_max_sum
+=
real_exp
(
softmax
[
idx
]);
idx
+=
BlockDim
;
}
diff_max_sum
=
BlockReduce
<
T
,
BlockDim
>
(
temp_storage
).
Reduce
(
diff_max_sum
,
cub
::
Sum
());
if
(
threadIdx
.
x
==
0
)
max_data
[
blockIdx
.
x
]
=
real_log
(
diff_max_sum
);
if
(
!
CalculateLogSoftmax
)
return
;
__syncthreads
();
diff_max_sum
=
max_data
[
blockIdx
.
x
];
softmax
[
beg_idx
]
-=
diff_max_sum
;
beg_idx
+=
BlockDim
;
while
(
beg_idx
<
end_idx
)
{
softmax
[
beg_idx
]
-=
diff_max_sum
;
beg_idx
+=
BlockDim
;
}
if
(
threadIdx
.
x
==
0
)
max_data
[
blockIdx
.
x
]
=
0
;
}
// Make sure that BlockDim <= feature_size
template
<
typename
T
,
int
BlockDim
>
__global__
void
RowReductionForSoftmaxAndCrossEntropy
(
const
T
*
logits_data
,
const
T
*
labels_data
,
T
*
loss_data
,
T
*
softmax
,
int
feature_size
)
{
static
__global__
void
RowReductionForSoftmaxAndCrossEntropy
(
const
T
*
logits_data
,
const
T
*
labels_data
,
T
*
loss_data
,
T
*
softmax
,
int
feature_size
)
{
__shared__
BlockReduceTempStorage
<
T
,
BlockDim
>
temp_storage
;
auto
beg_idx
=
feature_size
*
blockIdx
.
x
+
threadIdx
.
x
;
...
...
@@ -194,11 +206,134 @@ __global__ void RowReductionForSoftmaxAndCrossEntropy(const T* logits_data,
}
template
<
typename
T
>
__global__
void
SetSoftmaxToOneWhenFeatureSizeIsOne
(
T
*
out
,
int
batch_size
)
{
struct
HardLabelSoftmaxWithCrossEntropyFunctor
{
public:
HardLabelSoftmaxWithCrossEntropyFunctor
(
const
T
*
logits
,
const
int64_t
*
labels
,
T
*
loss
,
T
*
log_softmax
,
int
feature_size
)
:
logits_
(
logits
),
labels_
(
labels
),
loss_
(
loss
),
log_softmax_
(
log_softmax
),
feature_size_
(
feature_size
)
{}
__device__
void
operator
()(
int
idx
)
const
{
auto
row_idx
=
idx
/
feature_size_
;
auto
col_idx
=
idx
%
feature_size_
;
if
(
col_idx
!=
labels_
[
row_idx
])
{
log_softmax_
[
idx
]
=
real_exp
(
log_softmax_
[
idx
]);
}
else
{
auto
softmax
=
log_softmax_
[
idx
];
log_softmax_
[
idx
]
=
real_exp
(
softmax
);
loss_
[
row_idx
]
=
-
softmax
;
}
}
private:
const
T
*
logits_
;
const
int64_t
*
labels_
;
T
*
loss_
;
T
*
log_softmax_
;
int
feature_size_
;
};
template
<
typename
T
>
struct
HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx
{
public:
HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx
(
const
T
*
logits
,
const
int64_t
*
labels
,
T
*
loss
,
T
*
log_softmax
,
int
feature_size
,
int
ignore_idx
)
:
logits_
(
logits
),
labels_
(
labels
),
loss_
(
loss
),
log_softmax_
(
log_softmax
),
feature_size_
(
feature_size
),
ignore_idx_
(
ignore_idx
)
{}
__device__
void
operator
()(
int
idx
)
const
{
auto
row_idx
=
idx
/
feature_size_
;
auto
col_idx
=
idx
%
feature_size_
;
if
(
col_idx
!=
labels_
[
row_idx
]
||
col_idx
==
ignore_idx_
)
{
log_softmax_
[
idx
]
=
real_exp
(
log_softmax_
[
idx
]);
}
else
{
auto
softmax
=
log_softmax_
[
idx
];
log_softmax_
[
idx
]
=
real_exp
(
softmax
);
loss_
[
row_idx
]
=
-
softmax
;
}
}
private:
const
T
*
logits_
;
const
int64_t
*
labels_
;
T
*
loss_
;
T
*
log_softmax_
;
int
feature_size_
;
int
ignore_idx_
;
};
template
<
typename
T
>
static
__global__
void
SetSoftmaxToOneWhenFeatureSizeIsOne
(
T
*
out
,
int
batch_size
)
{
auto
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
idx
<
batch_size
)
out
[
idx
]
=
static_cast
<
T
>
(
1
);
}
template
<
typename
T
>
static
void
HardLabelSoftmaxWithCrossEntropy
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
T
*
logits_data
,
const
int64_t
*
labels_data
,
T
*
loss_data
,
T
*
softmax_data
,
int
batch_size
,
int
feature_size
,
int
ignore_idx
)
{
constexpr
int
kMaxBlockDim
=
512
;
int
block_dim
=
feature_size
>=
kMaxBlockDim
?
kMaxBlockDim
:
(
1
<<
static_cast
<
int
>
(
std
::
log2
(
feature_size
)));
auto
stream
=
ctx
.
stream
();
#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: { \
RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, feature_size); \
RowReductionForDiffMaxSum<T, BlockDim, \
true><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, feature_size); \
platform::ForRange<platform::CUDADeviceContext> for_range( \
ctx, batch_size* feature_size); \
if (ignore_idx >= 0 && ignore_idx < feature_size) { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>( \
logits_data, labels_data, loss_data, softmax_data, feature_size, \
ignore_idx)); \
} else { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>( \
logits_data, labels_data, loss_data, softmax_data, feature_size)); \
} \
} break
switch
(
block_dim
)
{
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
512
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
256
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
128
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
64
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
32
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
16
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
8
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
4
);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
(
2
);
case
1
:
SetSoftmaxToOneWhenFeatureSizeIsOne
<<<
(
batch_size
+
kMaxBlockDim
-
1
)
/
kMaxBlockDim
,
kMaxBlockDim
,
0
,
stream
>>>
(
softmax_data
,
batch_size
);
cudaMemsetAsync
(
loss_data
,
0
,
batch_size
*
sizeof
(
T
),
stream
);
break
;
default:
PADDLE_THROW
(
"BlockDim must be 2^n in softmax_with_cross_entropy_op"
);
break
;
}
#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL
}
template
<
typename
T
>
static
void
SoftmaxWithCrossEntropyFusedKernel
(
const
T
*
logits_data
,
const
T
*
labels_data
,
...
...
@@ -237,7 +372,7 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
kMaxBlockDim
,
kMaxBlockDim
,
0
,
stream
>>>
(
softmax_data
,
batch_size
);
cudaMemsetAsync
(
loss_data
,
0
,
batch_size
,
stream
);
cudaMemsetAsync
(
loss_data
,
0
,
batch_size
*
sizeof
(
T
)
,
stream
);
break
;
default:
PADDLE_THROW
(
"BlockDim must be 2^n in softmax_with_cross_entropy_op"
);
...
...
@@ -272,11 +407,21 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
logits_data
,
labels_data
,
softmax_data
,
loss_data
,
batch_size
,
feature_size
,
context
.
cuda_device_context
().
stream
());
}
else
{
math
::
SoftmaxCUDNNFunctor
<
T
>
()(
context
.
cuda_device_context
(),
logits
,
softmax
);
math
::
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
T
>
()(
context
.
cuda_device_context
(),
loss
,
softmax
,
labels
,
false
,
ignore_index
);
if
(
!
context
.
Attr
<
bool
>
(
"numeric_stable_mode"
))
{
math
::
SoftmaxCUDNNFunctor
<
T
>
()(
context
.
cuda_device_context
(),
logits
,
softmax
);
math
::
CrossEntropyFunctor
<
platform
::
CUDADeviceContext
,
T
>
()(
context
.
cuda_device_context
(),
loss
,
softmax
,
labels
,
false
,
ignore_index
);
}
else
{
int
batch_size
=
logits
->
dims
()[
0
];
int
feature_size
=
logits
->
dims
()[
1
];
auto
*
logits_data
=
logits
->
data
<
T
>
();
auto
*
labels_data
=
labels
->
data
<
int64_t
>
();
HardLabelSoftmaxWithCrossEntropy
<
T
>
(
context
.
cuda_device_context
(),
logits_data
,
labels_data
,
loss_data
,
softmax_data
,
batch_size
,
feature_size
,
ignore_index
);
}
}
}
};
...
...
paddle/fluid/operators/spp_op.h
浏览文件 @
a412826c
...
...
@@ -56,12 +56,14 @@ class SppKernel : public framework::OpKernel<T> {
math
::
Pool2dFunctor
<
DeviceContext
,
math
::
MaxPool
<
T
>
,
T
>
pool_forward
;
math
::
MaxPool
<
T
>
max_process
;
pool_forward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
kernel_size
,
strides
,
paddings
,
max_process
,
&
out_level
);
kernel_size
,
strides
,
paddings
,
max_process
,
true
,
&
out_level
);
}
else
if
(
pooling_type
==
"avg"
)
{
math
::
Pool2dFunctor
<
DeviceContext
,
math
::
AvgPool
<
T
>
,
T
>
pool_forward
;
math
::
AvgPool
<
T
>
avg_process
;
pool_forward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
kernel_size
,
strides
,
paddings
,
avg_process
,
&
out_level
);
kernel_size
,
strides
,
paddings
,
avg_process
,
true
,
&
out_level
);
}
// flatten pooling output shape
int
output_flatten_w
=
in_x
->
dims
()[
1
]
*
bins
*
bins
;
...
...
@@ -154,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
math
::
AvgPoolGrad
<
T
>
avg_process
;
pool_backward
(
context
.
template
device_context
<
DeviceContext
>(),
*
in_x
,
*&
out_level
,
*&
outgrad_level
,
kernel_size
,
strides
,
paddings
,
avg_process
,
in_x_grad
);
paddings
,
avg_process
,
true
,
in_x_grad
);
}
}
}
...
...
paddle/fluid/operators/sum_op.cc
浏览文件 @
a412826c
...
...
@@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel {
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
x_vars
=
ctx
.
MultiInputVar
(
"X"
);
auto
x_vars_name
=
ctx
.
Inputs
(
"X"
);
framework
::
LibraryType
library
{
framework
::
LibraryType
::
kPlain
};
framework
::
DataLayout
layout
{
framework
::
DataLayout
::
kAnyLayout
};
...
...
@@ -81,10 +82,11 @@ class SumOp : public framework::OperatorWithKernel {
if
(
x_vars
[
0
]
->
IsType
<
framework
::
LoDTensor
>
())
{
int
dtype
=
-
1
;
for
(
auto
&
x_var
:
x_vars
)
{
for
(
size_t
idx
=
0
;
idx
<
x_vars
.
size
();
++
idx
)
{
PADDLE_ENFORCE
(
x_vars
[
idx
]
!=
nullptr
,
"Input var[%s] should not be nullptr"
,
x_vars_name
[
idx
]);
// FIXME(zcd): The input x_var may be SelectedRows or LoDTensor.
auto
tensor
=
framework
::
GetTensorFromVar
(
const_cast
<
framework
::
Variable
*>
(
x_var
));
auto
tensor
=
framework
::
GetTensorFromVar
(
*
x_vars
[
idx
]);
if
(
tensor
->
numel
()
==
0
)
{
continue
;
}
...
...
paddle/fluid/platform/cudnn_helper.h
浏览文件 @
a412826c
...
...
@@ -76,8 +76,9 @@ enum class DataLayout { // Not use
enum
class
PoolingMode
{
kMaximum
,
kAverage
,
kMaximumDeterministic
,
kAverageExclusive
,
kAverageInclusive
,
};
#if CUDNN_VERSION < 6000
...
...
@@ -91,8 +92,10 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
switch
(
mode
)
{
case
PoolingMode
::
kMaximumDeterministic
:
return
CUDNN_POOLING_MAX
;
case
PoolingMode
::
kAverage
:
case
PoolingMode
::
kAverage
Exclusive
:
return
CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
;
case
PoolingMode
::
kAverageInclusive
:
return
CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
;
case
PoolingMode
::
kMaximum
:
return
CUDNN_POOLING_MAX
;
default:
...
...
@@ -105,8 +108,10 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
switch
(
mode
)
{
case
PoolingMode
::
kMaximumDeterministic
:
return
CUDNN_POOLING_MAX_DETERMINISTIC
;
case
PoolingMode
::
kAverage
:
case
PoolingMode
::
kAverage
Exclusive
:
return
CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING
;
case
PoolingMode
::
kAverageInclusive
:
return
CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING
;
case
PoolingMode
::
kMaximum
:
return
CUDNN_POOLING_MAX
;
default:
...
...
@@ -341,6 +346,28 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN
(
ScopedPoolingDescriptor
);
};
class
ScopedSpatialTransformerDescriptor
{
public:
ScopedSpatialTransformerDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnCreateSpatialTransformerDescriptor
(
&
desc_
));
}
~
ScopedSpatialTransformerDescriptor
()
{
PADDLE_ENFORCE
(
dynload
::
cudnnDestroySpatialTransformerDescriptor
(
desc_
));
}
template
<
typename
T
>
inline
cudnnSpatialTransformerDescriptor_t
descriptor
(
const
int
nbDims
,
const
int
dimA
[])
{
PADDLE_ENFORCE
(
dynload
::
cudnnSetSpatialTransformerNdDescriptor
(
desc_
,
CUDNN_SAMPLER_BILINEAR
,
CudnnDataType
<
T
>::
type
,
nbDims
,
dimA
));
return
desc_
;
}
private:
cudnnSpatialTransformerDescriptor_t
desc_
;
DISABLE_COPY_AND_ASSIGN
(
ScopedSpatialTransformerDescriptor
);
};
inline
bool
CanCUDNNBeUsed
(
const
framework
::
ExecutionContext
&
ctx
)
{
bool
use_cudnn
=
ctx
.
Attr
<
bool
>
(
"use_cudnn"
);
use_cudnn
&=
paddle
::
platform
::
is_gpu_place
(
ctx
.
GetPlace
());
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
a412826c
...
...
@@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
"'Place' is not supported, Please re-compile with WITH_GPU "
"option"
);
}
return
it
->
second
.
get
();
return
it
->
second
.
get
()
.
get
()
;
}
const
std
::
vector
<
const
DeviceContext
*>
DeviceContextPool
::
GetAllDeviceContexts
()
const
{
std
::
vector
<
const
DeviceContext
*>
all_device_ctx
;
all_device_ctx
.
reserve
(
device_contexts_
.
size
());
for
(
auto
&
dev_ctx
:
device_contexts_
)
{
all_device_ctx
.
emplace_back
(
dev_ctx
.
second
.
get
());
}
return
all_device_ctx
;
template
<
typename
DevCtx
,
typename
PlaceType
>
inline
void
EmplaceDeviceContext
(
std
::
map
<
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
DeviceContext
>>>*
map_ptr
,
platform
::
Place
p
)
{
using
PtrType
=
std
::
unique_ptr
<
DeviceContext
>
;
map_ptr
->
emplace
(
p
,
std
::
async
(
std
::
launch
::
deferred
,
[
=
]
{
// lazy evaluation. i.e., only create device context at
// first `Get`
return
PtrType
(
new
DevCtx
(
boost
::
get
<
PlaceType
>
(
p
)));
}));
}
DeviceContextPool
::
DeviceContextPool
(
const
std
::
vector
<
platform
::
Place
>&
places
)
{
PADDLE_ENFORCE_GT
(
places
.
size
(),
0
);
using
PtrType
=
std
::
unique_ptr
<
DeviceContext
>
;
std
::
set
<
Place
>
set
;
for
(
auto
&
p
:
places
)
{
set
.
insert
(
p
);
...
...
@@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool(
for
(
auto
&
p
:
set
)
{
if
(
platform
::
is_cpu_place
(
p
))
{
#ifdef PADDLE_WITH_MKLDNN
device_contexts_
.
emplace
(
p
,
PtrType
(
new
MKLDNNDeviceContext
(
boost
::
get
<
CPUPlace
>
(
p
))));
EmplaceDeviceContext
<
MKLDNNDeviceContext
,
CPUPlace
>
(
&
device_contexts_
,
p
);
#else
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CPUDeviceContext
(
boost
::
get
<
CPUPlace
>
(
p
))));
EmplaceDeviceContext
<
CPUDeviceContext
,
CPUPlace
>
(
&
device_contexts_
,
p
);
#endif
}
else
if
(
platform
::
is_gpu_place
(
p
))
{
#ifdef PADDLE_WITH_CUDA
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CUDADeviceContext
(
boost
::
get
<
CUDAPlace
>
(
p
))));
EmplaceDeviceContext
<
CUDADeviceContext
,
CUDAPlace
>
(
&
device_contexts_
,
p
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...
...
@@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool(
#endif
}
else
if
(
platform
::
is_cuda_pinned_place
(
p
))
{
#ifdef PADDLE_WITH_CUDA
device_contexts_
.
emplace
(
p
,
PtrType
(
new
CUDAPinnedDeviceContext
(
boost
::
get
<
CUDAPinnedPlace
>
(
p
))));
EmplaceDeviceContext
<
CUDAPinnedDeviceContext
,
CUDAPinnedPlace
>
(
&
device_contexts_
,
p
);
#else
PADDLE_THROW
(
"'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
...
...
paddle/fluid/platform/device_context.h
浏览文件 @
a412826c
...
...
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <future> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <string>
...
...
@@ -223,9 +224,6 @@ class DeviceContextPool {
/*! \brief Return handle of single device context. */
platform
::
DeviceContext
*
Get
(
const
platform
::
Place
&
place
);
/*! \brief Return all the device contexts. */
const
std
::
vector
<
const
DeviceContext
*>
GetAllDeviceContexts
()
const
;
template
<
typename
Place
>
const
typename
DefaultDeviceContextType
<
Place
>::
TYPE
*
GetByPlace
(
const
Place
&
place
)
{
...
...
@@ -237,7 +235,8 @@ class DeviceContextPool {
private:
static
DeviceContextPool
*
pool
;
std
::
map
<
Place
,
std
::
unique_ptr
<
DeviceContext
>>
device_contexts_
;
std
::
map
<
Place
,
std
::
shared_future
<
std
::
unique_ptr
<
DeviceContext
>>>
device_contexts_
;
DISABLE_COPY_AND_ASSIGN
(
DeviceContextPool
);
};
...
...
paddle/fluid/platform/dynload/cudnn.h
浏览文件 @
a412826c
...
...
@@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
* include all needed cudnn functions in HPPL
* different cudnn version has different interfaces
**/
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
#define CUDNN_DNN_ROUTINE_EACH(__macro) \
__macro(cudnnSetTensor4dDescriptor); \
__macro(cudnnSetTensor4dDescriptorEx); \
__macro(cudnnSetTensorNdDescriptor); \
__macro(cudnnGetTensorNdDescriptor); \
__macro(cudnnGetConvolutionNdForwardOutputDim); \
__macro(cudnnGetConvolutionForwardAlgorithm); \
__macro(cudnnCreateTensorDescriptor); \
__macro(cudnnDestroyTensorDescriptor); \
__macro(cudnnCreateFilterDescriptor); \
__macro(cudnnSetFilter4dDescriptor); \
__macro(cudnnSetFilterNdDescriptor); \
__macro(cudnnGetFilterNdDescriptor); \
__macro(cudnnSetPooling2dDescriptor); \
__macro(cudnnSetPoolingNdDescriptor); \
__macro(cudnnGetPoolingNdDescriptor); \
__macro(cudnnDestroyFilterDescriptor); \
__macro(cudnnCreateConvolutionDescriptor); \
__macro(cudnnCreatePoolingDescriptor); \
__macro(cudnnDestroyPoolingDescriptor); \
__macro(cudnnSetConvolution2dDescriptor); \
__macro(cudnnDestroyConvolutionDescriptor); \
__macro(cudnnSetConvolutionNdDescriptor); \
__macro(cudnnGetConvolutionNdDescriptor); \
__macro(cudnnDeriveBNTensorDescriptor); \
__macro(cudnnCreateSpatialTransformerDescriptor); \
__macro(cudnnSetSpatialTransformerNdDescriptor); \
__macro(cudnnDestroySpatialTransformerDescriptor); \
__macro(cudnnSpatialTfGridGeneratorForward); \
__macro(cudnnSpatialTfGridGeneratorBackward); \
__macro(cudnnSpatialTfSamplerForward); \
__macro(cudnnSpatialTfSamplerBackward); \
__macro(cudnnCreate); \
__macro(cudnnDestroy); \
__macro(cudnnSetStream); \
__macro(cudnnActivationForward); \
__macro(cudnnConvolutionForward); \
__macro(cudnnConvolutionBackwardBias); \
__macro(cudnnGetConvolutionForwardWorkspaceSize); \
__macro(cudnnTransformTensor); \
__macro(cudnnPoolingForward); \
__macro(cudnnPoolingBackward); \
__macro(cudnnSoftmaxBackward); \
__macro(cudnnSoftmaxForward); \
__macro(cudnnGetVersion); \
__macro(cudnnGetErrorString);
CUDNN_DNN_ROUTINE_EACH
(
DECLARE_DYNAMIC_LOAD_CUDNN_WRAP
)
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
a412826c
...
...
@@ -821,6 +821,13 @@ All parameter, weight, gradient are variables in Paddle.
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
enable_data_balance_
=
b
;
})
// FIXME(chengudo): enable_data_balance seems not important
.
def_property
(
"enable_sequential_execution"
,
[](
const
BuildStrategy
&
self
)
{
return
self
.
enable_sequential_execution_
;
},
[](
BuildStrategy
&
self
,
bool
b
)
{
self
.
enable_sequential_execution_
=
b
;
})
.
def_property
(
"fuse_elewise_add_act_ops"
,
[](
const
BuildStrategy
&
self
)
{
...
...
paddle/scripts/paddle_build.sh
浏览文件 @
a412826c
...
...
@@ -147,13 +147,11 @@ function cmake_gen() {
-DWITH_SWIG_PY=
${
WITH_SWIG_PY
:-
ON
}
-DCUDNN_ROOT=/usr/
-DWITH_TESTING=
${
WITH_TESTING
:-
ON
}
-DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=
${
WITH_FLUID_ONLY
:-
OFF
}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=
${
WITH_CONTRIB
:-
ON
}
-DWITH_INFERENCE=
${
WITH_INFERENCE
:-
ON
}
-DWITH_INFERENCE_API_TEST=
${
WITH_INFERENCE_API_TEST
:-
ON
}
-DINFERENCE_DEMO_INSTALL_DIR=
${
INFERENCE_DEMO_INSTALL_DIR
}
-DWITH_ANAKIN=
${
WITH_ANAKIN
:-
OFF
}
...
...
@@ -181,12 +179,10 @@ EOF
-DWITH_PYTHON
=
${
WITH_PYTHON
:-
ON
}
\
-DCUDNN_ROOT
=
/usr/
\
-DWITH_TESTING
=
${
WITH_TESTING
:-
ON
}
\
-DWITH_FAST_BUNDLE_TEST
=
ON
\
-DCMAKE_MODULE_PATH
=
/opt/rocm/hip/cmake
\
-DWITH_FLUID_ONLY
=
${
WITH_FLUID_ONLY
:-
OFF
}
\
-DCMAKE_EXPORT_COMPILE_COMMANDS
=
ON
\
-DWITH_CONTRIB
=
${
WITH_CONTRIB
:-
ON
}
\
-DWITH_INFERENCE
=
${
WITH_INFERENCE
:-
ON
}
\
-DWITH_INFERENCE_API_TEST
=
${
WITH_INFERENCE_API_TEST
:-
ON
}
\
-DINFERENCE_DEMO_INSTALL_DIR
=
${
INFERENCE_DEMO_INSTALL_DIR
}
\
-DWITH_ANAKIN
=
${
WITH_ANAKIN
:-
OFF
}
\
...
...
@@ -653,7 +649,7 @@ function gen_capi_package() {
function
gen_fluid_lib
()
{
mkdir
-p
${
PADDLE_ROOT
}
/build
cd
${
PADDLE_ROOT
}
/build
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Generating fluid library for train and inference ...
...
...
@@ -666,7 +662,7 @@ EOF
}
function
tar_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Taring fluid library for train and inference ...
...
...
@@ -681,7 +677,7 @@ EOF
}
function
test_fluid_lib
()
{
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
&&
${
WITH_INFERENCE
:-
ON
}
==
"ON"
]]
;
then
if
[[
${
WITH_C_API
:-
OFF
}
==
"OFF"
]]
;
then
cat
<<
EOF
========================================
Testing fluid library for inference ...
...
...
python/paddle/fluid/io.py
浏览文件 @
a412826c
...
...
@@ -884,12 +884,13 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
load_prog
=
Program
()
load_block
=
load_prog
.
global_block
()
need_delete_vars
=
[]
for
var_tuple
in
slice_vars_and_attrs
:
orig_var
=
var_tuple
[
0
]
start
=
var_tuple
[
1
]
slice_var
=
var_tuple
[
2
]
end
=
start
+
reduce
(
lambda
x
,
y
:
x
*
y
,
slice_var
.
shape
)
end
=
start
+
slice_var
.
shape
[
0
]
clone_orig_var
=
load_block
.
create_var
(
name
=
orig_var
.
name
,
...
...
@@ -917,5 +918,8 @@ def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs):
attrs
=
{
'axes'
:
[
0
],
'starts'
:
[
start
],
'ends'
:
[
end
]})
need_delete_vars
.
append
(
clone_orig_var
)
load_block
.
append_op
(
type
=
'delete_var'
,
inputs
=
{
'X'
:
need_delete_vars
},
)
executor
.
run
(
load_prog
)
python/paddle/fluid/layers/control_flow.py
浏览文件 @
a412826c
...
...
@@ -1586,8 +1586,7 @@ class DynamicRNN(object):
self
.
lod_rank_table
=
None
self
.
max_seq_len
=
None
self
.
step_idx
=
None
self
.
zero_idx
=
fill_constant
(
shape
=
[
1
],
value
=
0
,
dtype
=
'int64'
,
force_cpu
=
True
)
self
.
zero_idx
=
None
self
.
mem_dict
=
dict
()
self
.
output_array
=
[]
self
.
outputs
=
[]
...
...
@@ -1792,6 +1791,7 @@ class DynamicRNN(object):
"""
self
.
_assert_in_rnn_block_
(
'memory'
)
self
.
_init_zero_idx_
()
if
init
is
not
None
:
if
not
isinstance
(
init
,
Variable
):
raise
TypeError
(
...
...
@@ -1905,6 +1905,22 @@ class DynamicRNN(object):
array_write
(
x
=
each
,
i
=
self
.
step_idx
,
array
=
outside_array
)
self
.
output_array
.
append
(
outside_array
)
def _init_zero_idx_(self):
    """Create ``self.zero_idx`` in the parent block on first use.

    If ``self.zero_idx`` is already set this is a no-op. Otherwise a new
    int64 variable is created in the block returned by
    ``self._parent_block_()`` and a ``fill_constant`` op is appended to
    that block to fill it with a single zero (``force_cpu`` is set).
    """
    if self.zero_idx is not None:
        return

    block = self._parent_block_()
    zero_var = block.create_var(
        name=unique_name.generate('zero_idx'), dtype='int64')
    self.zero_idx = zero_var
    fill_attrs = {
        'shape': [1],
        'dtype': zero_var.dtype,
        'value': 0.0,
        'force_cpu': True,
    }
    block.append_op(
        type='fill_constant',
        inputs={},
        outputs={'Out': [zero_var]},
        attrs=fill_attrs)
def
_parent_block_
(
self
):
prog
=
self
.
helper
.
main_program
parent_idx
=
prog
.
current_block
().
parent_idx
...
...
python/paddle/fluid/layers/nn.py
浏览文件 @
a412826c
...
...
@@ -154,9 +154,11 @@ __all__ = [
'mul'
,
'sigmoid_cross_entropy_with_logits'
,
'maxout'
,
'affine_grid'
,
'sequence_reverse'
,
'affine_channel'
,
'hash'
,
'grid_sampler'
,
'log_loss'
,
'add_position_encoding'
,
]
...
...
@@ -710,8 +712,18 @@ def dynamic_gru(input,
The first part are weights of the update gate and reset gate with
shape :math:`(D
\\
times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D
\\
times D)`.
bias_attr(ParamAttr): The parameter attribute for learnable the
hidden-hidden bias.
If it is set to None or one attribute of ParamAttr, dynamic_gru will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1
\\
times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, dynamic_gru will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`.
gate_activation(str): The activation for update gate and reset gate.
...
...
@@ -749,7 +761,7 @@ def dynamic_gru(input,
attr
=
helper
.
bias_attr
,
shape
=
[
1
,
3
*
size
],
dtype
=
dtype
,
is_bias
=
True
)
batch_size
=
input
.
shape
[
0
]
inputs
=
{
'Input'
:
input
,
'Weight'
:
weight
,
'Bias'
:
bias
}
if
h_0
!=
None
:
if
h_0
:
assert
h_0
.
shape
==
(
batch_size
,
size
),
'The shape of h0 should be(batch_size, %d)'
%
size
...
...
@@ -810,10 +822,29 @@ def gru_unit(input,
Args:
input (Variable): The fc transformed input value of current step.
hidden (Variable): The hidden value of
lstm
unit from previous step.
hidden (Variable): The hidden value of
gru
unit from previous step.
size (integer): The input dimension value.
param_attr (ParamAttr): The weight parameters for gru unit. Default: None
bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(D
\\
times 3D)`, where
:math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with
shape :math:`(D
\\
times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D
\\
times D)`.
If it is set to None or one attribute of ParamAttr, gru_unit will
create ParamAttr as param_attr. If the Initializer of the param_attr
is not set, the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
of GRU. Note that the bias with :math:`(1
\\
times 3D)` concatenates
the bias in the update gate, reset gate and candidate calculations.
If it is set to False, no bias will be applied to the update gate,
reset gate and candidate calculations. If it is set to None or one
attribute of ParamAttr, gru_unit will create ParamAttr as
bias_attr. If the Initializer of the bias_attr is not set, the bias
is initialized zero. Default: None.
activation (string): The activation type for cell (actNode).
Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
...
...
@@ -1825,7 +1856,7 @@ def conv3d(input,
return
helper
.
append_activation
(
pre_act
)
def
sequence_pool
(
input
,
pool_type
):
def
sequence_pool
(
input
,
pool_type
,
is_test
=
False
):
"""
This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied
...
...
@@ -1862,6 +1893,7 @@ def sequence_pool(input, pool_type):
input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max.
is_test(bool, Default False): Used to distinguish training from scoring mode.
Returns:
The sequence pooling variable which is a Tensor.
...
...
@@ -1889,7 +1921,8 @@ def sequence_pool(input, pool_type):
inputs
=
{
"X"
:
input
},
outputs
=
{
"Out"
:
pool_out
,
"MaxIndex"
:
max_index
},
attrs
=
{
"pooltype"
:
pool_type
.
upper
()})
attrs
=
{
"pooltype"
:
pool_type
.
upper
(),
"is_test"
:
is_test
})
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
...
...
@@ -2069,7 +2102,8 @@ def pool2d(input,
global_pooling
=
False
,
use_cudnn
=
True
,
ceil_mode
=
False
,
name
=
None
):
name
=
None
,
exclusive
=
True
):
"""
${comment}
...
...
@@ -2083,11 +2117,13 @@ def pool2d(input,
pool_type: ${pooling_type_comment}
pool_stride (int): stride of the pooling layer.
pool_padding (int): padding size.
global_pooling: ${global_pooling_comment}
use_cudnn: ${use_cudnn_comment}
ceil_mode: ${ceil_mode_comment}
global_pooling
(bool)
: ${global_pooling_comment}
use_cudnn
(bool)
: ${use_cudnn_comment}
ceil_mode
(bool)
: ${ceil_mode_comment}
name (str|None): A name for this layer(optional). If set None, the
layer will be named automatically.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is true
Returns:
Variable: The pooling result.
...
...
@@ -2145,7 +2181,8 @@ def pool2d(input,
"paddings"
:
pool_padding
,
"use_cudnn"
:
use_cudnn
,
"ceil_mode"
:
ceil_mode
,
"use_mkldnn"
:
False
"use_mkldnn"
:
False
,
"exclusive"
:
exclusive
,
})
return
pool_out
...
...
@@ -2159,7 +2196,8 @@ def pool3d(input,
global_pooling
=
False
,
use_cudnn
=
True
,
ceil_mode
=
False
,
name
=
None
):
name
=
None
,
exclusive
=
True
):
"""
This function adds the operator for pooling in 3-dimensions, using the
pooling configurations mentioned in input parameters.
...
...
@@ -2175,6 +2213,8 @@ def pool3d(input,
ceil_mode (bool): ${ceil_mode_comment}
name (str): A name for this layer(optional). If set None, the layer
will be named automatically.
exclusive (bool): Whether to exclude padding points in average pooling
mode, default is true
Returns:
Variable: output of pool3d layer.
...
...
@@ -2213,7 +2253,8 @@ def pool3d(input,
"paddings"
:
pool_padding
,
"use_cudnn"
:
use_cudnn
,
"ceil_mode"
:
ceil_mode
,
"use_mkldnn"
:
False
"use_mkldnn"
:
False
,
"exclusive"
:
exclusive
,
})
return
pool_out
...
...
@@ -3018,7 +3059,8 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
x = fluid.layers.data(name='y', shape=[10, 5],
dtype='float32', lod_level=1)
pad_value = fluid.layers.assign(input=numpy.array([0]))
pad_value = fluid.layers.assign(
input=numpy.array([0], dtype=numpy.float32))
out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
"""
...
...
@@ -4440,7 +4482,10 @@ def transpose(x, perm, name=None):
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
# use append_batch_size=False to avoid prepending extra
# batch size in shape
x = fluid.layers.data(name='x', shape=[5, 10, 15],
dtype='float32', append_batch_size=False)
x_transposed = layers.transpose(x, perm=[1, 0, 2])
"""
...
...
@@ -4677,7 +4722,8 @@ def multiplex(inputs, index):
def
softmax_with_cross_entropy
(
logits
,
label
,
soft_label
=
False
,
ignore_index
=-
100
):
ignore_index
=-
100
,
numeric_stable_mode
=
False
):
"""
**Softmax With Cross Entropy Operator.**
...
...
@@ -4711,6 +4757,18 @@ def softmax_with_cross_entropy(logits,
\\
left(
\\
text{logit}_i -
\\
log
\\
left(
\\
sum_{i=0}^{K}
\\
exp(
\\
text{logit}_i)
\\
right)
\\
right), j = 1,...,K
3) If numeric_stable_mode is True, softmax is calculated first by:
.. math::
max_j =
\\
max_{i=0}^{K}{
\\
text{logit}_i}
log
\\
_max
\\
_sum_j =
\\
log
\\
sum_{i=0}^{K}
\\
exp(logit_i - max_j)
softmax_j =
\\
exp(logit_j - max_j - {log
\\
_max
\\
_sum}_j)
and then cross entropy loss is calculated by softmax and label.
Args:
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
...
...
@@ -4722,6 +4780,13 @@ def softmax_with_cross_entropy(logits,
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100
numeric_stable_mode (bool): A flag to indicate whether to use a more
numerically stable algorithm. Only valid
when soft_label is False and GPU is used.
When soft_label is True or CPU is used,
the algorithm is always numerically stable.
Note that the speed may be slower when use
stable algorithm. Default: False
Returns:
Variable: The cross entropy loss is a 2-D tensor with shape [N x 1].
...
...
@@ -4744,8 +4809,11 @@ def softmax_with_cross_entropy(logits,
'Label'
:
label
},
outputs
=
{
'Softmax'
:
softmax
,
'Loss'
:
loss
},
attrs
=
{
'soft_label'
:
soft_label
,
'ignore_index'
:
ignore_index
})
attrs
=
{
'soft_label'
:
soft_label
,
'ignore_index'
:
ignore_index
,
'numeric_stable_mode'
:
numeric_stable_mode
})
return
loss
...
...
@@ -6105,6 +6173,124 @@ def crop(x, shape=None, offsets=None, name=None):
return
out
def affine_grid(theta, out_shape, name=None):
    """
    It generates a grid of (x,y) coordinates using the parameters of
    the affine transformation that correspond to a set of points where
    the input feature map should be sampled to produce the transformed
    output feature map.

    .. code-block:: text

        * Case 1:

          Given:

              theta = [[[x_11, x_12, x_13]
                        [x_14, x_15, x_16]]
                       [[x_21, x_22, x_23]
                        [x_24, x_25, x_26]]]

              out_shape = [2, 3, 5, 5]

          Step 1:

              Generate normalized coordinates according to out_shape.
              The values of the normalized coordinates are in the interval between -1 and 1.
              The shape of the normalized coordinates is [2, H, W] as below:

              C = [[[-1.  -1.  -1.  -1.  -1. ]
                    [-0.5 -0.5 -0.5 -0.5 -0.5]
                    [ 0.   0.   0.   0.   0. ]
                    [ 0.5  0.5  0.5  0.5  0.5]
                    [ 1.   1.   1.   1.   1. ]]
                   [[-1.  -0.5  0.   0.5  1. ]
                    [-1.  -0.5  0.   0.5  1. ]
                    [-1.  -0.5  0.   0.5  1. ]
                    [-1.  -0.5  0.   0.5  1. ]
                    [-1.  -0.5  0.   0.5  1. ]]]
              C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.

          Step2:

              Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get:
              C_ = [[-1.  -1.   1. ]
                    [-0.5 -1.   1. ]
                    [ 0.  -1.   1. ]
                    [ 0.5 -1.   1. ]
                    [ 1.  -1.   1. ]
                    [-1.  -0.5  1. ]
                    [-0.5 -0.5  1. ]
                    [ 0.  -0.5  1. ]
                    [ 0.5 -0.5  1. ]
                    [ 1.  -0.5  1. ]
                    [-1.   0.   1. ]
                    [-0.5  0.   1. ]
                    [ 0.   0.   1. ]
                    [ 0.5  0.   1. ]
                    [ 1.   0.   1. ]
                    [-1.   0.5  1. ]
                    [-0.5  0.5  1. ]
                    [ 0.   0.5  1. ]
                    [ 0.5  0.5  1. ]
                    [ 1.   0.5  1. ]
                    [-1.   1.   1. ]
                    [-0.5  1.   1. ]
                    [ 0.   1.   1. ]
                    [ 0.5  1.   1. ]
                    [ 1.   1.   1. ]]
          Step3:
              Compute output by equation $$Output[i] = C_ * Theta[i]^T$$

    Args:
        theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
                                             out_shape can be a Variable or a list or tuple.
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        Variable: The output with shape [N, H, W, 2].

    Raises:
        ValueError: If the type of arguments is not supported.

    Examples:

        .. code-block:: python

            theta = fluid.layers.data(name="x", shape=[2, 3], dtype="float32")
            out_shape = fluid.layers.data(name="y", shape=[-1], dtype="float32")
            data = fluid.layers.affine_grid(theta, out_shape)

            # or
            data = fluid.layers.affine_grid(theta, [5, 3, 28, 28])

    """
    # NOTE(review): `name` is accepted but not forwarded to LayerHelper here.
    helper = LayerHelper('affine_grid')

    if not isinstance(out_shape, (list, tuple, Variable)):
        raise ValueError("The out_shape should be a list, tuple or Variable.")

    if not isinstance(theta, Variable):
        raise ValueError("The theta should be a Variable.")

    out = helper.create_variable_for_type_inference(theta.dtype)
    ipts = {'Theta': theta}
    attrs = {}
    # A Variable shape is fed as an op input; a static list/tuple shape is
    # passed as an op attribute instead.
    if isinstance(out_shape, Variable):
        ipts['OutputShape'] = out_shape
    else:
        attrs['output_shape'] = out_shape

    helper.append_op(
        type='affine_grid',
        inputs=ipts,
        outputs={'Output': out},
        # Pass None rather than an empty dict when no attribute was set.
        attrs=attrs or None)
    return out
def
rank_loss
(
label
,
left
,
right
,
name
=
None
):
"""
**Rank loss layer for RankNet**
...
...
@@ -7319,10 +7505,10 @@ def clip(x, min, max, name=None):
helper
=
LayerHelper
(
"clip"
,
**
locals
())
if
name
is
None
:
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
x
.
dtype
)
else
:
out
=
helper
.
create_variable
(
name
=
name
,
dtype
=
x
.
dtype
,
persistable
=
False
)
name
=
unique_name
.
generate
(
"."
.
join
([
helper
.
name
,
'tmp'
])
)
out
=
helper
.
create_variable
(
type
=
x
.
type
,
name
=
name
,
dtype
=
x
.
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
"clip"
,
...
...
@@ -7351,10 +7537,10 @@ def clip_by_norm(x, max_norm, name=None):
helper
=
LayerHelper
(
"clip_by_norm"
,
**
locals
())
if
name
is
None
:
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
x
.
dtype
)
else
:
out
=
helper
.
create_variable
(
name
=
name
,
dtype
=
x
.
dtype
,
persistable
=
False
)
name
=
unique_name
.
generate
(
"."
.
join
([
helper
.
name
,
'tmp'
])
)
out
=
helper
.
create_variable
(
type
=
x
.
type
,
name
=
name
,
dtype
=
x
.
dtype
,
persistable
=
False
)
helper
.
append_op
(
type
=
"clip_by_norm"
,
...
...
@@ -7558,19 +7744,59 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
def
hash
(
input
,
hash_size
,
num_hash
=
1
,
name
=
None
):
"""
hash the input
Args:
input (Variable): The input variable which is a one-hot word.
hash_size (int): The space size for hash algorithm.
Hash the input to an integer whose value is less than the given hash size.
The hash algorithm we used was xxHash - Extremely fast hash algorithm
(https://github.com/Cyan4973/xxHash/tree/v0.6.5)
A simple example as below:
.. code-block:: text
Given:
# shape [2, 2]
input.data = [
[[1], [2]],
[[3], [4]],
]
input.lod = [[0, 2]]
hash_size = 10000
num_hash = 4
Then:
Hash op will take all number in input's 2nd dimension as hash algorithm's
input for each time. Each input will be hashed for 4 times, and get an
array whose length is 4. Each value in the array ranges from 0 to 9999.
# shape [2, 4]
output.data = [
[[9662], [9217], [1129], [8487]],
[[8310], [1327], [1654], [4567]],
]
output.lod = [[0, 2]]
Args:
input (Variable): The input variable which is a one-hot word. The
dimensions of the input variable must be 2.
hash_size (int): The space size for hash algorithm. The output value
will keep in the range:math:`[0, hash_size - 1]`.
num_hash (int): The times of hash, default 1.
name (str, default None): The name of this layer.
Returns:
Variable: The hash result variable which is a LoDTensor.
Examples:
.. code-block:: python
word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(shape[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, len(word_dict))
Returns:
Variable: The hash result variable which is a LoDTensor.
Examples:
.. code-block:: python
word_dict = paddle.dataset.imdb.word_dict()
x = fluid.layers.data(name="x", shape=[1], dtype='int32', lod_level=1)
out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000)
"""
helper
=
LayerHelper
(
'hash'
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
...
...
@@ -7584,6 +7810,87 @@ def hash(input, hash_size, num_hash=1, name=None):
return
out
@templatedoc()
def grid_sampler(x, grid, name=None):
    """
    This operation samples input X by using bilinear interpolation based on
    flow field grid, which is usually generated by affine_grid. The grid of
    shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
    with shape [N, H, W] each, where grid_x is indexing the 4th dimension
    (in width dimension) of input data x and grid_y is indexing the 3rd
    dimension (in height dimension), finally results is the bilinear
    interpolation value of 4 nearest corner points.

    Step 1:
    Get (x, y) grid coordinates and scale to [0, H-1/W-1].

    grid_x = 0.5 * (grid[:, :, :, 0] + 1) * (W - 1)
    grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)

    Step 2:
    Indices input data X with grid (x, y) in each [H, W] area, and bilinear
    interpolate point value by 4 nearest points.

      wn ------- y_n ------- en
      |           |           |
      |          d_n          |
      |           |           |
     x_w --d_w-- grid--d_e-- x_e
      |           |           |
      |          d_s          |
      |           |           |
      ws ------- y_s ------- es

    x_w = floor(x)              // west side x coord
    x_e = x_w + 1               // east side x coord
    y_n = floor(y)              // north side y coord
    y_s = y_n + 1               // south side y coord

    d_w = grid_x - x_w          // distance to west side
    d_e = x_e - grid_x          // distance to east side
    d_n = grid_y - y_n          // distance to north side
    d_s = y_s - grid_y          // distance to south side

    wn = X[:, :, y_n, x_w]      // north-west point value
    en = X[:, :, y_n, x_e]      // north-east point value
    ws = X[:, :, y_s, x_w]      // south-west point value
    es = X[:, :, y_s, x_e]      // south-east point value

    output = wn * d_e * d_s + en * d_w * d_s
           + ws * d_e * d_n + es * d_w * d_n

    Args:
        x(Variable): Input data of shape [N, C, H, W].
        grid(Variable): Input grid tensor of shape [N, H, W, 2].
        name (str, default None): The name of this layer.

    Returns:
        out(Variable): Output of shape [N, C, H, W] data samples input X
        using bilinear interpolation based on input grid.

    Raises:
        ValueError: If x or grid is not a Variable.

    Examples:

        .. code-block:: python

            x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32')
            theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32')
            grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
            out = fluid.layers.grid_sampler(x=x, grid=grid)
    """
    helper = LayerHelper("grid_sampler", **locals())

    # Raise (not return) the error so invalid arguments fail at the call
    # site instead of handing an exception object back to the caller.
    if not isinstance(x, Variable):
        raise ValueError("The x should be a Variable")

    if not isinstance(grid, Variable):
        raise ValueError("The grid should be a Variable")

    out = helper.create_variable_for_type_inference(x.dtype)
    ipts = {'X': x, 'Grid': grid}

    helper.append_op(type='grid_sampler', inputs=ipts, outputs={'Output': out})
    return out
def
log_loss
(
input
,
label
,
epsilon
=
1e-4
,
name
=
None
):
"""
**Negative Log Loss Layer**
...
...
python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
浏览文件 @
a412826c
file
(
GLOB TEST_OPS RELATIVE
"
${
CMAKE_CURRENT_SOURCE_DIR
}
"
"test_*.py"
)
string
(
REPLACE
".py"
""
TEST_OPS
"
${
TEST_OPS
}
"
)
# default test
foreach
(
src
${
TEST_OPS
}
)
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endforeach
()
if
(
NOT APPLE
)
# default test
foreach
(
src
${
TEST_OPS
}
)
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endforeach
()
else
()
foreach
(
src
${
TEST_OPS
}
)
if
(
${
src
}
STREQUAL
"test_image_classification_vgg"
)
message
(
WARNING
"These tests has been disabled in OSX for random fail:
\n
"
${
src
}
)
elseif
(
${
src
}
STREQUAL
"test_image_classification_resnet"
)
message
(
WARNING
"These tests has been disabled in OSX for random fail:
\n
"
${
src
}
)
elseif
()
py_test
(
${
src
}
SRCS
${
src
}
.py
)
endif
()
endforeach
()
endif
()
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
a412826c
...
...
@@ -17,6 +17,10 @@ if(NOT WITH_DISTRIBUTE)
list
(
REMOVE_ITEM TEST_OPS test_listen_and_serv_op
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_mnist
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_word2vec
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_ctr
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_simnet_bow
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge
)
LIST
(
REMOVE_ITEM TEST_OPS test_dist_text_classification
)
endif
(
NOT WITH_DISTRIBUTE
)
list
(
REMOVE_ITEM TEST_OPS test_seq_concat_op
)
# FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
...
...
@@ -55,6 +59,7 @@ function(py_test_modules TARGET_NAME)
if
(
py_test_modules_SERIAL
)
set_property
(
TEST
${
TARGET_NAME
}
PROPERTY RUN_SERIAL 1
)
endif
()
set_tests_properties
(
${
TARGET_NAME
}
PROPERTIES TIMEOUT 600
)
endif
()
endfunction
()
list
(
REMOVE_ITEM TEST_OPS test_warpctc_op
)
...
...
@@ -88,4 +93,6 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE
py_test_modules
(
test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL
)
set_tests_properties
(
test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150
)
py_test_modules
(
test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
if
(
NOT APPLE
)
py_test_modules
(
test_image_classification_resnet MODULES test_image_classification_resnet SERIAL
)
endif
()
python/paddle/fluid/tests/unittests/dist_mnist.py
浏览文件 @
a412826c
...
...
@@ -90,8 +90,10 @@ class TestDistMnist2x2(TestDistRunnerBase):
inference_program
=
fluid
.
default_main_program
().
clone
()
# Optimization
opt
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
0.001
,
beta1
=
0.9
,
beta2
=
0.999
)
# TODO(typhoonzero): fix distributed adam optimizer
# opt = fluid.optimizer.AdamOptimizer(
# learning_rate=0.001, beta1=0.9, beta2=0.999)
opt
=
fluid
.
optimizer
.
Momentum
(
learning_rate
=
0.001
,
momentum
=
0.9
)
# Reader
train_reader
=
paddle
.
batch
(
...
...
python/paddle/fluid/tests/unittests/dist_save_load.py
0 → 100644
浏览文件 @
a412826c
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
sys
import
signal
import
subprocess
import
argparse
import
time
import
math
import
random
from
multiprocessing
import
Process
from
functools
import
reduce
import
numpy
as
np
import
unittest
import
six
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
from
paddle.fluid
import
io
from
test_dist_base
import
TestDistRunnerBase
,
runtime_main
,
RUN_STEP
from
dist_simnet_bow
import
TestDistSimnetBow2x2
,
DATA_URL
,
DATA_MD5
class
TestDistSaveLoad2x2
(
TestDistSimnetBow2x2
):
def _load_persistable_vars(self, executor, dirname, program):
    """Load the checkpoint-eligible variables of `program` from `dirname`.

    Delegates to ``io.load_vars`` with a predicate that selects which
    variables are loaded; ``filename=None`` is passed through unchanged.
    """
    excluded_types = (
        core.VarDesc.VarType.FEED_MINIBATCH,
        core.VarDesc.VarType.FETCH_LIST,
        core.VarDesc.VarType.RAW, )
    # Name fragments that mark variables the checkpoint does not save:
    # gradients, distributed-training trainer/block splits, and temporaries.
    excluded_name_parts = ("@GRAD", ".trainer_", ".block", "tmp_")

    def _is_checkpoint_var(var):
        """
        Decide whether `var` takes part in the checkpoint.

        Variables whose type is FEED_MINIBATCH/FETCH_LIST/RAW, or whose
        name contains one of the excluded fragments, are rejected; any
        other variable is accepted only if it is persistable.

        : param var(Variable)
        """
        if var.desc.type() in excluded_types:
            return False
        if any(part in var.name for part in excluded_name_parts):
            return False
        return var.persistable

    io.load_vars(
        executor,
        dirname=dirname,
        main_program=program,
        predicate=_is_checkpoint_var,
        filename=None)
def
run_pserver
(
self
,
args
):
self
.
get_model
(
batch_size
=
2
)
# NOTE: pserver should not call memory optimize
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
)
pserver_prog
=
t
.
get_pserver_program
(
args
.
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
args
.
current_endpoint
,
pserver_prog
)
need_load
=
bool
(
int
(
os
.
getenv
(
"LOAD"
,
"0"
)))
model_dir
=
os
.
getenv
(
"MODEL_DIR"
,
""
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup_prog
)
if
need_load
and
model_dir
:
self
.
_load_persistable_vars
(
exe
,
model_dir
,
startup_prog
)
exe
.
run
(
pserver_prog
)
def
run_trainer
(
self
,
args
):
test_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
=
\
self
.
get_model
(
batch_size
=
2
)
if
args
.
mem_opt
:
fluid
.
memory_optimize
(
fluid
.
default_main_program
(),
skip_grads
=
True
)
if
args
.
is_dist
:
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
)
trainer_prog
=
t
.
get_trainer_program
()
else
:
trainer_prog
=
fluid
.
default_main_program
()
if
args
.
use_cuda
:
place
=
fluid
.
CUDAPlace
(
0
)
else
:
place
=
fluid
.
CPUPlace
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
fluid
.
default_startup_program
())
strategy
=
fluid
.
ExecutionStrategy
()
strategy
.
num_threads
=
1
strategy
.
allow_op_delay
=
False
build_stra
=
fluid
.
BuildStrategy
()
if
args
.
use_reduce
:
build_stra
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
else
:
build_stra
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
exe
=
fluid
.
ParallelExecutor
(
args
.
use_cuda
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
,
build_strategy
=
build_stra
)
feed_var_list
=
[
var
for
var
in
trainer_prog
.
global_block
().
vars
.
values
()
if
var
.
is_data
]
feeder
=
fluid
.
DataFeeder
(
feed_var_list
,
place
)
reader_generator
=
train_reader
()
def
get_data
():
origin_batch
=
next
(
reader_generator
)
if
args
.
is_dist
and
args
.
use_reader_alloc
:
new_batch
=
[]
for
offset
,
item
in
enumerate
(
origin_batch
):
if
offset
%
2
==
args
.
trainer_id
:
new_batch
.
append
(
item
)
return
new_batch
else
:
return
origin_batch
need_save
=
bool
(
int
(
os
.
getenv
(
"SAVE"
,
"0"
)))
model_dir
=
os
.
getenv
(
"MODEL_DIR"
,
""
)
if
need_save
:
for
_
in
six
.
moves
.
xrange
(
RUN_STEP
):
loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
()))
if
need_save
and
model_dir
:
io
.
save_persistables
(
startup_exe
,
model_dir
,
trainer_prog
)
var
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'__fc_b__'
).
get_tensor
())
print
(
np
.
ravel
(
var
).
tolist
())
if
__name__
==
"__main__"
:
paddle
.
dataset
.
common
.
download
(
DATA_URL
,
'simnet'
,
DATA_MD5
,
"train"
)
runtime_main
(
TestDistSaveLoad2x2
)
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
浏览文件 @
a412826c
...
...
@@ -40,7 +40,8 @@ class TestParallelExecutorBase(unittest.TestCase):
use_reduce
=
False
,
fuse_elewise_add_act_ops
=
False
,
optimizer
=
fluid
.
optimizer
.
Adam
,
use_fast_executor
=
False
):
use_fast_executor
=
False
,
enable_sequential_execution
=
False
):
def
run_executor
(
exe
,
feed
,
fetch_list
,
program
=
None
):
if
isinstance
(
exe
,
fluid
.
ParallelExecutor
):
res
=
exe
.
run
(
fetch_list
=
fetch_list
,
feed
=
feed
)
...
...
@@ -80,6 +81,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy
.
reduce_strategy
=
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
\
if
use_reduce
else
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
build_strategy
.
fuse_elewise_add_act_ops
=
fuse_elewise_add_act_ops
build_strategy
.
enable_sequential_execution
=
enable_sequential_execution
if
use_parallel_executor
:
exe
=
fluid
.
ParallelExecutor
(
...
...
python/paddle/fluid/tests/unittests/test_affine_grid_op.py
0 → 100644
浏览文件 @
a412826c
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
def AffineGrid(theta, size):
    """NumPy reference implementation of the affine_grid operator.

    Args:
        theta: array of per-sample 2x3 affine matrices, indexed [n, 2, 3].
        size: sequence indexed as (n, c, h, w); only n, h and w are read.

    Returns:
        float32 array of shape [n, h, w, 2] holding, for every output pixel,
        the affine-transformed (x, y) coordinate, where the untransformed
        coordinates span [-1, 1] along each axis.
    """
    batch, height, width = size[0], size[2], size[3]

    # xs[j, k] is the k-th column coordinate, ys[j, k] the j-th row coordinate.
    xs, ys = np.meshgrid(
        np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    # Homogeneous coordinates (x, y, 1) for every pixel: h * w * 3
    homogeneous = np.stack([xs, ys, np.ones([height, width])], axis=2)
    flat_coords = homogeneous.reshape([height * width, 3])

    transformed = np.zeros([batch, height * width, 2])
    # Each sample multiplies the shared base grid by its transposed matrix.
    for index, matrix in enumerate(theta.transpose([0, 2, 1])):
        transformed[index] = np.dot(flat_coords, matrix)

    return transformed.reshape([batch, height, width, 2]).astype("float32")
class TestAffineGridOp(OpTest):
    """Check the affine_grid operator against the NumPy AffineGrid reference.

    The output shape is passed as an attribute here; subclasses override
    ``initTestCase`` to switch to passing it as the 'OutputShape' input.
    """

    def setUp(self):
        """Build operator inputs, attributes and the expected output."""
        self.initTestCase()
        self.op_type = "affine_grid"
        # An all-ones theta is what the test has always fed to the operator;
        # a random theta that used to be generated first was overwritten
        # before use, so that dead assignment has been dropped.
        theta = np.ones(self.theta_shape).astype("float32")
        self.inputs = {'Theta': theta}
        self.attrs = {"use_cudnn": True}
        if self.dynamic_shape:
            self.inputs['OutputShape'] = self.output_shape
        else:
            self.attrs['output_shape'] = self.output_shape
        self.outputs = {'Output': AffineGrid(theta, self.output_shape)}

    def test_check_output(self):
        """Compare the operator's forward output with the reference."""
        self.check_output()

    def test_check_grad_normal(self):
        """Check the gradient with respect to 'Theta' only."""
        self.check_grad(
            ['Theta'],
            'Output',
            no_grad_set=['OutputShape'],
            max_relative_error=0.006)

    def initTestCase(self):
        """Set the shapes and choose the static (attribute) shape path."""
        self.theta_shape = (3, 2, 3)
        self.output_shape = np.array([3, 2, 5, 7]).astype("int32")
        self.dynamic_shape = False
class TestAffineGridOpCase1(TestAffineGridOp):
    """Same check as the base case, with the output shape fed as an input."""

    def initTestCase(self):
        """Use the base shapes but enable the dynamic 'OutputShape' path."""
        self.dynamic_shape = True
        self.theta_shape = (3, 2, 3)
        self.output_shape = np.array([3, 2, 5, 7], dtype="int32")
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a412826c
...
...
@@ -22,6 +22,8 @@ import signal
import
subprocess
import
six
import
argparse
import
pickle
import
numpy
as
np
import
paddle.fluid
as
fluid
...
...
@@ -128,10 +130,15 @@ class TestDistRunnerBase(object):
else
:
return
origin_batch
out_losses
=
[]
for
_
in
six
.
moves
.
xrange
(
RUN_STEP
):
loss
,
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
()))
print
(
loss
)
out_losses
.
append
(
loss
[
0
])
if
six
.
PY2
:
print
(
pickle
.
dumps
(
out_losses
))
else
:
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
out_losses
))
def
runtime_main
(
test_class
):
...
...
@@ -149,7 +156,7 @@ def runtime_main(test_class):
parser
.
add_argument
(
'--use_cuda'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reduce'
,
action
=
'store_true'
)
parser
.
add_argument
(
'--use_reader_alloc'
,
action
=
'store_true'
,
required
=
False
,
default
=
True
)
'--use_reader_alloc'
,
action
=
'store_true'
,
required
=
False
)
parser
.
add_argument
(
'--batch_size'
,
required
=
False
,
type
=
int
,
default
=
2
)
parser
.
add_argument
(
'--batch_merge_repeat'
,
required
=
False
,
type
=
int
,
default
=
1
)
...
...
@@ -188,7 +195,7 @@ class TestDistBase(unittest.TestCase):
self
.
_pservers
=
2
self
.
_ps_endpoints
=
"127.0.0.1:%s,127.0.0.1:%s"
%
(
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
"python"
self
.
_python_interp
=
sys
.
executable
self
.
_sync_mode
=
True
self
.
_enforce_place
=
None
self
.
_mem_opt
=
False
...
...
@@ -237,21 +244,6 @@ class TestDistBase(unittest.TestCase):
return
ps0_proc
,
ps1_proc
,
ps0_pipe
,
ps1_pipe
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
50
while
True
:
assert
retry_times
>=
0
,
"wait ps ready failed"
time
.
sleep
(
3
)
try
:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os
.
stat
(
"/tmp/paddle.%d.port"
%
pid
)
return
except
os
.
error
as
e
:
sys
.
stderr
.
write
(
'waiting for pserver: %s, left retry %d
\n
'
%
(
e
,
retry_times
))
retry_times
-=
1
def
_run_local
(
self
,
model
,
envs
,
...
...
@@ -288,23 +280,20 @@ class TestDistBase(unittest.TestCase):
env
=
envs
)
local_out
,
local_err
=
local_proc
.
communicate
()
local_ret
=
cpt
.
to_text
(
local_out
)
if
check_error_log
:
err_log
.
close
()
sys
.
stderr
.
write
(
'local_stdout: %s
\n
'
%
local_ret
)
sys
.
stderr
.
write
(
'local_stdout: %s
\n
'
%
pickle
.
loads
(
local_out
)
)
sys
.
stderr
.
write
(
'local_stderr: %s
\n
'
%
local_err
)
local_losses
=
local_ret
.
split
(
"
\n
"
)
return
local_losses
return
pickle
.
loads
(
local_out
)
def
_run_cluster
(
self
,
model
,
envs
,
check_error_log
):
# Run dist train to compare with local results
ps0
,
ps1
,
ps0_pipe
,
ps1_pipe
=
self
.
start_pserver
(
model
,
check_error_log
,
envs
)
self
.
_wait_ps_ready
(
ps0
.
pid
)
self
.
_wait_ps_ready
(
ps1
.
pid
)
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
tr_cmd
=
"%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
...
...
@@ -339,8 +328,8 @@ class TestDistBase(unittest.TestCase):
env0
.
update
(
envs
)
env1
.
update
(
envs
)
print
(
"tr0_cmd:{}
, env0: {}"
.
format
(
tr0_cmd
,
env0
))
print
(
"tr1_cmd:{}
, env1: {}"
.
format
(
tr1_cmd
,
env1
))
print
(
"tr0_cmd:{}
"
.
format
(
tr0_cmd
))
print
(
"tr1_cmd:{}
"
.
format
(
tr1_cmd
))
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
...
...
@@ -356,9 +345,7 @@ class TestDistBase(unittest.TestCase):
env
=
env1
)
tr0_out
,
tr0_err
=
tr0_proc
.
communicate
()
tr0_loss_text
=
cpt
.
to_text
(
tr0_out
)
tr1_out
,
tr1_err
=
tr1_proc
.
communicate
()
tr1_loss_text
=
cpt
.
to_text
(
tr1_out
)
# close trainer file
tr0_pipe
.
close
()
...
...
@@ -373,15 +360,13 @@ class TestDistBase(unittest.TestCase):
ps1
.
terminate
()
# print log
sys
.
stderr
.
write
(
'trainer 0 stdout:
\n
%s
\n
'
%
tr0_loss_text
)
sys
.
stderr
.
write
(
'trainer 0 stderr:
\n
%s
\n
'
%
tr0_err
)
sys
.
stderr
.
write
(
'trainer 1 stdout: %s
\n
'
%
tr1_loss_text
)
sys
.
stderr
.
write
(
'trainer 0 stdout:
%s
\n
'
%
pickle
.
loads
(
tr0_out
)
)
sys
.
stderr
.
write
(
'trainer 0 stderr: %s
\n
'
%
tr0_err
)
sys
.
stderr
.
write
(
'trainer 1 stdout: %s
\n
'
%
pickle
.
loads
(
tr1_out
)
)
sys
.
stderr
.
write
(
'trainer 1 stderr: %s
\n
'
%
tr1_err
)
tr0_losses
=
tr0_loss_text
.
split
(
"
\n
"
)
tr1_losses
=
tr1_loss_text
.
split
(
"
\n
"
)
return
tr0_losses
,
tr1_losses
# return tr0_losses, tr1_losses
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
)
def
check_with_place
(
self
,
model_file
,
...
...
@@ -411,9 +396,9 @@ class TestDistBase(unittest.TestCase):
check_error_log
)
for
step_id
in
range
(
RUN_STEP
):
local_loss
=
eval
(
local_losses
[
step_id
])[
0
]
tr0_loss
=
eval
(
tr0_losses
[
step_id
])[
0
]
tr1_loss
=
eval
(
tr1_losses
[
step_id
])[
0
]
dist_loss
=
(
tr0_loss
+
tr1_loss
)
/
2
print
(
str
(
local_loss
)
+
":"
+
str
(
dist_loss
)
)
self
.
assertAlmostEqual
(
local_loss
,
dist_loss
,
delta
=
delta
)
local_loss
=
local_losses
[
step_id
]
tr0_loss
=
tr0_losses
[
step_id
]
tr1_loss
=
tr1_losses
[
step_id
]
dist_loss
=
(
np
.
array
([
tr0_loss
])
+
np
.
array
([
tr1_loss
])
)
/
2
print
(
"======="
,
local_loss
,
":"
,
dist_loss
[
0
],
"======="
)
self
.
assertAlmostEqual
(
local_loss
,
dist_loss
[
0
]
,
delta
=
delta
)
python/paddle/fluid/tests/unittests/test_dist_save_load.py
0 → 100644
浏览文件 @
a412826c
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
os
import
shutil
import
unittest
import
tempfile
import
numpy
as
np
from
test_dist_base
import
TestDistBase
,
RUN_STEP
class TestDistSaveLoadDense2x2(TestDistBase):
    """Check that variables saved by a local run survive a distributed reload.

    A local run is launched with SAVE=1 and a cluster run with LOAD=1, both
    pointing at the same temporary MODEL_DIR; the values each run reports are
    then compared element-wise.
    """

    def _setup_config(self):
        self._sync_mode = True
        self._enforce_place = "CPU"

    def check_with_place(self,
                         model_file,
                         delta=1e-3,
                         check_error_log=False,
                         need_envs=None):
        """Run ``model_file`` locally and on a cluster and compare results.

        Args:
            model_file: script passed to ``_run_local`` / ``_run_cluster``.
            delta: maximum absolute difference allowed between elements.
            check_error_log: when true, enable verbose GLOG output.
            need_envs: optional extra environment variables for the runs.
        """
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "http_proxy": ""
        }
        # ``need_envs`` defaults to None rather than a shared mutable dict.
        if need_envs:
            required_envs.update(need_envs)

        if check_error_log:
            required_envs["GLOG_v"] = "7"
            required_envs["GLOG_logtostderr"] = "1"

        model_dir = tempfile.mkdtemp()
        try:
            # required_envs is applied last so callers can override the
            # SAVE/LOAD/MODEL_DIR settings, as before.
            local_env = {"SAVE": "1", "MODEL_DIR": model_dir}
            local_env.update(required_envs)

            cluster_env = {"LOAD": "1", "MODEL_DIR": model_dir}
            cluster_env.update(required_envs)

            local_var = self._run_local(model_file, local_env, check_error_log)
            tr0_var, tr1_var = self._run_cluster(model_file, cluster_env,
                                                 check_error_log)
        finally:
            # Remove the temporary model directory even when a run fails.
            shutil.rmtree(model_dir)

        # NOTE(review): eval() is applied to text produced by the subprocesses
        # this test launched itself; do not reuse it on untrusted output.
        local_np = np.array(eval(local_var[0]))
        train0_np = np.array(eval(tr0_var[0]))
        train1_np = np.array(eval(tr1_var[0]))

        # Compare the arrays element-wise. Comparing ``x.all()`` values only
        # compared two booleans and could never detect a numeric mismatch.
        np.testing.assert_allclose(local_np, train0_np, rtol=0, atol=delta)
        np.testing.assert_allclose(local_np, train1_np, rtol=0, atol=delta)
        np.testing.assert_allclose(train0_np, train1_np, rtol=0, atol=delta)

    @unittest.skip(reason="CI fail")
    def test_dist(self):
        need_envs = {
            "IS_DISTRIBUTED": '0',
            "IS_SPARSE": '0',
            'IS_SELF_CONTAINED_LR': '1'
        }
        self.check_with_place(
            "dist_save_load.py",
            delta=0,
            check_error_log=False,
            need_envs=need_envs)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
浏览文件 @
a412826c
...
...
@@ -23,16 +23,17 @@ class TestDistSeResneXt2x2(TestDistBase):
self
.
_use_reader_alloc
=
False
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
00
)
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
e-7
)
class
TestDistseResnXt2x2WithMemopt
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_mem_opt
=
True
self
.
_use_reader_alloc
=
False
def
test_dist_train
(
self
):
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
00
)
self
.
check_with_place
(
"dist_se_resnext.py"
,
delta
=
1
e-7
)
class
TestDistSeResneXt2x2Async
(
TestDistBase
):
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
a412826c
...
...
@@ -283,6 +283,25 @@ class TestDecayedAdagrad(TranspilerTest):
trainer
,
_
=
self
.
get_trainer
()
class TestFtrl(TranspilerTest):
    """Transpile a single-fc regression network optimized with Ftrl."""

    def net_conf(self):
        """Build the network: fc(1000) -> squared error -> mean, with Ftrl."""
        features = fluid.layers.data(name='x', shape=[1000], dtype='float32')
        prediction = fluid.layers.fc(
            input=features,
            size=1000,
            act=None,
            param_attr=fluid.ParamAttr(name='fc_w'),
            bias_attr=fluid.ParamAttr(name='fc_b'))
        target = fluid.layers.data(name='y', shape=[1], dtype='float32')
        squared_error = fluid.layers.square_error_cost(
            input=prediction, label=target)
        mean_loss = fluid.layers.mean(squared_error)
        fluid.optimizer.Ftrl(learning_rate=0.1).minimize(mean_loss)

    def transpiler_test_impl(self):
        """Only require that the pserver and trainer programs can be built."""
        _pserver_prog, _startup_prog = self.get_pserver(self.pserver1_ep)
        _trainer_prog, _ = self.get_trainer()
class
TestLRDecayConditional
(
TranspilerTest
):
def
net_conf
(
self
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1000
],
dtype
=
'float32'
)
...
...
@@ -405,18 +424,43 @@ class TestL2DecayWithPiecewise(TranspilerTest):
[
"sum"
,
"scale"
,
"scale"
,
"elementwise_add"
,
"momentum"
])
class TestEmptyPserverOptimizeBlocks(TranspilerTest):
    """Transpile a one-parameter network and inspect the second pserver."""

    def net_conf(self):
        """Build a bias-free fc regression network trained with SGD."""
        features = fluid.layers.data(name='x', shape=[1000], dtype='float32')
        # only one parameter: the fc layer has a weight but no bias
        prediction = fluid.layers.fc(input=features,
                                     size=1000,
                                     act=None,
                                     param_attr=fluid.ParamAttr(name='fc_w'),
                                     bias_attr=False)
        target = fluid.layers.data(name='y', shape=[1], dtype='float32')
        squared_error = fluid.layers.square_error_cost(
            input=prediction, label=target)
        mean_loss = fluid.layers.mean(squared_error)
        fluid.optimizer.SGD(learning_rate=1.0).minimize(mean_loss)

    def transpiler_test_impl(self):
        """Expect two blocks on pserver2, the second one holding no ops."""
        transpiler_config = fluid.DistributeTranspilerConfig()
        transpiler_config.slice_var_up = False

        pserver_prog, _startup_prog = self.get_pserver(
            ep=self.pserver2_ep, config=transpiler_config)

        block_count = len(pserver_prog.blocks)
        self.assertEqual(block_count, 2)
        optimize_ops = pserver_prog.blocks[1].ops
        self.assertEqual(len(optimize_ops), 0)
class
TestDistLookupTableBase
(
TranspilerTest
):
def
network_with_table
(
self
,
is_sparse
,
is_distributed
):
self
.
table_size
=
1000
self
.
emb_size
=
64
self
.
lookup_table_name
=
'shared_w'
def
emb_pool
(
ids
):
def
emb_pool
(
ids
,
table_name
,
is_distributed
):
emb
=
fluid
.
layers
.
embedding
(
input
=
ids
,
size
=
[
self
.
table_size
,
self
.
emb_size
],
dtype
=
'float32'
,
param_attr
=
self
.
lookup_table_name
,
# share parameter
param_attr
=
table_name
,
is_sparse
=
is_sparse
,
is_distributed
=
is_distributed
)
pool
=
fluid
.
layers
.
sequence_pool
(
input
=
emb
,
pool_type
=
'average'
)
...
...
@@ -426,9 +470,13 @@ class TestDistLookupTableBase(TranspilerTest):
name
=
'title_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
brand_ids
=
fluid
.
layers
.
data
(
name
=
'brand_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
title_emb
=
emb_pool
(
title_ids
)
brand_emb
=
emb_pool
(
brand_ids
)
fc0
=
fluid
.
layers
.
concat
(
input
=
[
title_emb
,
brand_emb
],
axis
=
1
)
profile_ids
=
fluid
.
layers
.
data
(
name
=
'brand_ids'
,
shape
=
[
1
],
dtype
=
'int64'
,
lod_level
=
1
)
title_emb
=
emb_pool
(
title_ids
,
self
.
lookup_table_name
,
is_distributed
)
brand_emb
=
emb_pool
(
brand_ids
,
self
.
lookup_table_name
,
is_distributed
)
profile_emb
=
emb_pool
(
profile_ids
,
"profile_emb"
,
False
)
fc0
=
fluid
.
layers
.
concat
(
input
=
[
title_emb
,
brand_emb
,
profile_emb
],
axis
=
1
)
predict
=
fluid
.
layers
.
fc
(
input
=
fc0
,
size
=
2
,
act
=
None
,
...
...
@@ -449,7 +497,7 @@ class TestLocalLookupTable(TestDistLookupTableBase):
def
transpiler_test_impl
(
self
):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
3
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
4
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
...
...
@@ -459,16 +507,23 @@ class TestLocalLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
trainer
,
_
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -480,39 +535,45 @@ class TestDistLookupTable(TestDistLookupTableBase):
def
transpiler_test_impl
(
self
):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
5
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
6
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
#
2 optimize for table sgd
#
4 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sum"
,
"scale"
,
"adam"
,
"scale"
,
"scale"
])
# 2 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sum"
,
"sgd"
])
# 3 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"lookup_sparse_table"
])
#
4
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"save"
])
#
5
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
,
trainer_startup
=
self
.
get_trainer
()
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'fetch_barrier'
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'send_barrier'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
startup_ops
=
[
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'fake_init'
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'fill_constant'
,
'uniform_random'
,
'uniform_random'
,
'recv'
,
'recv'
,
'recv'
,
'fetch_barrier'
,
'concat'
,
'fake_init'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer_startup
.
blocks
[
0
].
ops
],
startup_ops
)
...
...
@@ -526,7 +587,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
config
=
fluid
.
DistributeTranspilerConfig
()
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
,
False
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
3
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
4
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
...
...
@@ -535,17 +596,23 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase):
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'lookup_table'
,
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_selected_rows'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
@@ -559,29 +626,34 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
pserver1
,
startup1
=
self
.
get_pserver
(
self
.
pserver1_ep
,
config
,
False
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
5
)
self
.
assertEqual
(
len
(
pserver1
.
blocks
),
6
)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
1
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 2 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"sgd"
])
# 3 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
# 2 optimize for table adam
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
2
].
ops
],
[
"adam"
,
"scale"
,
"scale"
])
# 3 optimize for table sgd
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
3
].
ops
],
[
"sgd"
])
# 4 prefetch -> lookup_sparse_table for data0
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"lookup_sparse_table"
])
#
4
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
4
].
ops
],
[
"save"
])
#
5
save table
self
.
assertEqual
([
op
.
type
for
op
in
pserver1
.
blocks
[
5
].
ops
],
[
"save"
])
trainer
,
_
=
self
.
get_trainer
(
config
)
self
.
assertEqual
(
len
(
trainer
.
blocks
),
1
)
ops
=
[
'split_ids'
,
'prefetch'
,
'merge_ids'
,
'sequence_pool'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
'sequence_pool'
,
'lookup_table'
,
'sequence_pool'
,
'concat'
,
'mul'
,
'elementwise_add'
,
'cross_entropy'
,
'mean'
,
'fill_constant'
,
'mean_grad'
,
'cross_entropy_grad'
,
'elementwise_add_grad'
,
'send'
,
'mul_grad'
,
'send'
,
'concat_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'split_selected_rows'
,
'send'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sequence_pool_grad'
,
'lookup_table_grad'
,
'sum'
,
'split_ids'
,
'send'
,
'recv'
,
'recv'
,
'recv'
,
'concat'
]
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
...
...
python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
0 → 100644
浏览文件 @
a412826c
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
import
numpy
as
np
from
op_test
import
OpTest
def AffineGrid(theta, size):
    """NumPy reference for affine grid generation.

    Args:
        theta: array of per-sample 2x3 affine matrices, indexed [n, 2, 3].
        size: sequence indexed as (n, c, h, w); only n, h and w are read.

    Returns:
        float32 array of shape [n, h, w, 2] with the transformed (x, y)
        coordinate of every output pixel; untransformed coordinates span
        [-1, 1] along each axis.
    """
    batch, height, width = size[0], size[2], size[3]

    # cols[j, k] varies along the width, rows[j, k] along the height.
    cols, rows = np.meshgrid(
        np.linspace(-1, 1, width), np.linspace(-1, 1, height))
    # (x, y, 1) per pixel, flattened to (h * w) x 3
    base = np.stack([cols, rows, np.ones([height, width])], axis=2)
    base = base.reshape([height * width, 3])

    result = np.zeros([batch, height * width, 2])
    # Apply each sample's transposed 2x3 matrix to the shared base grid.
    for sample, matrix in enumerate(theta.transpose([0, 2, 1])):
        result[sample] = np.dot(base, matrix)

    return result.reshape([batch, height, width, 2]).astype("float32")
def getGridPointValue(data, x, y):
    """Gather ``data`` at integer pixel positions, zero-filling outside.

    Args:
        data: array indexed [n, c, h, w].
        x: integer column indices, indexed [n, h, w].
        y: integer row indices, indexed [n, h, w].

    Returns:
        Array of ``data.shape`` with dtype 'float'. Entry [i, :, j, k] is
        ``data[i, :, y[i, j, k], x[i, j, k]]`` when that position lies inside
        the image and 0 otherwise.
    """
    shape = data.shape
    batch, height, width = shape[0], shape[2], shape[3]

    gathered = np.zeros(shape, dtype='float')
    for i, j, k in np.ndindex(batch, height, width):
        row = y[i, j, k]
        col = x[i, j, k]
        outside = row < 0 or row > height - 1 or col < 0 or col > width - 1
        if outside:
            # The output was zero-initialised, so nothing needs writing.
            continue
        gathered[i, :, j, k] = data[i, :, row, col]
    return gathered
def GridSampler(data, grid):
    """NumPy reference for bilinear grid sampling.

    Args:
        data: array indexed [n, c, h, w].
        grid: array indexed [n, h, w, 2]; channel 0 is x and channel 1 is y.
            The code maps -1 to pixel 0 and +1 to the last pixel of an axis.

    Returns:
        float32 array: the four neighbouring pixels of every sampling point
        blended with bilinear weights. Neighbours outside the image
        contribute zero (see ``getGridPointValue``).
    """
    shape = data.shape
    batch, height, width = shape[0], shape[2], shape[3]

    # Map the grid coordinates onto pixel coordinates.
    px = 0.5 * ((grid[:, :, :, 0].astype('float32') + 1.0) * (width - 1))
    py = 0.5 * ((grid[:, :, :, 1].astype('float32') + 1.0) * (height - 1))

    left = np.floor(px).astype('int32')
    right = left + 1
    top = np.floor(py).astype('int32')
    bottom = top + 1

    def _per_channel(weight):
        # A singleton channel axis lets the weight broadcast over all channels.
        return weight.reshape((batch, 1, height, width))

    w_top_left = _per_channel((right - px) * (bottom - py))
    w_bottom_left = _per_channel((right - px) * (py - top))
    w_top_right = _per_channel((px - left) * (bottom - py))
    w_bottom_right = _per_channel((px - left) * (py - top))

    v_top_left = getGridPointValue(data, left, top)
    v_bottom_left = getGridPointValue(data, left, bottom)
    v_top_right = getGridPointValue(data, right, top)
    v_bottom_right = getGridPointValue(data, right, bottom)

    blended = (w_top_left * v_top_left + w_bottom_left * v_bottom_left +
               w_top_right * v_top_right + w_bottom_right * v_bottom_right)
    return blended.astype('float32')
class TestGridSamplerOp(OpTest):
    """Check the grid_sampler operator against the NumPy GridSampler reference."""

    def setUp(self):
        """Create a random image and an affine sampling grid as inputs."""
        self.initTestCase()
        self.op_type = 'grid_sampler'
        image = np.random.randint(0, 255, self.x_shape).astype('float32')

        # Fill the 2x3 affine matrix of every sample with uniform randoms,
        # drawn one at a time in row-major order.
        theta = np.zeros(self.theta_shape).astype('float32')
        for position in np.ndindex(self.theta_shape[0], 2, 3):
            theta[position] = np.random.rand(1)[0]
        sampling_grid = AffineGrid(theta, self.x_shape)

        self.inputs = {'X': image, 'Grid': sampling_grid}
        self.attrs = {'use_cudnn': True}
        self.outputs = {'Output': GridSampler(image, sampling_grid)}

    def test_check_output(self):
        """Compare the forward output with the reference."""
        self.check_output(atol=1e-3)

    def test_check_grad_normal(self):
        """Check gradients with respect to both 'X' and 'Grid'."""
        self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61)

    def initTestCase(self):
        """Define the input, grid and theta shapes used by ``setUp``."""
        self.theta_shape = (2, 2, 3)
        self.grid_shape = (2, 7, 3, 2)
        self.x_shape = (2, 5, 7, 3)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_layers.py
浏览文件 @
a412826c
...
...
@@ -865,6 +865,31 @@ class TestBook(unittest.TestCase):
self
.
assertIsNotNone
(
out
)
print
(
str
(
program
))
def test_grid_sampler(self):
    """Build a program containing a grid_sampler layer and print it."""
    program = Program()
    with program_guard(program):
        image = layers.data(name='x', shape=[3, 5, 7], dtype='float32')
        sampling_grid = layers.data(
            name='grid', shape=[5, 7, 2], dtype='float32')
        sampled = layers.grid_sampler(image, sampling_grid)
        self.assertIsNotNone(sampled)
    print(str(program))
def test_affine_grid(self):
    """Build affine_grid layers with a tensor shape and a list shape."""
    program = Program()
    with program_guard(program):
        # NOTE(review): this argsort looks unrelated to affine_grid; it is
        # kept because it adds ops to the printed program.
        data = layers.data(name='data', shape=[2, 3, 3], dtype="float32")
        _sorted, _indices = layers.argsort(input=data, axis=1)

        theta = layers.data(name="theta", shape=[2, 3], dtype="float32")
        out_shape = layers.data(
            name="out_shape", shape=[-1], dtype="float32")

        # Output shape given as a variable, then as a Python list.
        grid_from_var = layers.affine_grid(theta, out_shape)
        grid_from_list = layers.affine_grid(theta, [5, 3, 28, 28])

        for grid in (grid_from_var, grid_from_list):
            self.assertIsNotNone(grid)
    print(str(program))
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
浏览文件 @
a412826c
...
...
@@ -55,6 +55,46 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
exe
.
run
(
pserver_prog
)
def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers,
                                 trainer_id):
    """Start a parameter server whose optimize block contains no ops.

    A one-parameter regression network is transpiled for two endpoints
    with ``slice_var_up`` disabled. The program for the endpoint at
    ``ip:port`` is then asserted to have exactly two blocks, the second
    one with zero ops, and is run.
    """
    # Network: a single fc layer without bias, so one trainable parameter.
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y_predict = fluid.layers.fc(input=x, size=1, act=None, bias_attr=False)
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')

    # Loss.
    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
    avg_cost = fluid.layers.mean(cost)

    # Optimizer.
    fluid.optimizer.SGD(learning_rate=0.001).minimize(avg_cost)

    if use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Two endpoints: this server (ps2) and a second one on the next port.
    ps1 = ip + ":" + str(int(port) + 1)
    ps2 = ip + ":" + port
    pserver_endpoints = ",".join([ps1, ps2])

    config = fluid.DistributeTranspilerConfig()
    config.slice_var_up = False
    transpiler = fluid.DistributeTranspiler(config=config)
    transpiler.transpile(
        trainer_id,
        pservers=pserver_endpoints,
        trainers=trainers,
        sync_mode=sync_mode)
    pserver_prog = transpiler.get_pserver_program(ps2)

    # ps2 holds no parameter: its second block must be empty.
    assert len(pserver_prog.blocks) == 2
    assert len(pserver_prog.blocks[1].ops) == 0

    pserver_startup = transpiler.get_startup_program(ps2, pserver_prog)
    exe.run(pserver_startup)
    exe.run(pserver_prog)
class
TestListenAndServOp
(
OpTest
):
def
setUp
(
self
):
self
.
ps_timeout
=
5
...
...
@@ -63,9 +103,9 @@ class TestListenAndServOp(OpTest):
self
.
trainers
=
1
self
.
trainer_id
=
0
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
):
def
_start_pserver
(
self
,
use_cuda
,
sync_mode
,
pserver_func
):
p
=
Process
(
target
=
run_pserver
,
target
=
pserver_func
,
args
=
(
use_cuda
,
sync_mode
,
self
.
ip
,
self
.
port
,
self
.
trainers
,
self
.
trainer_id
))
p
.
daemon
=
True
...
...
@@ -92,7 +132,24 @@ class TestListenAndServOp(OpTest):
def
test_handle_signal_in_serv_op
(
self
):
# run pserver on CPU in sync mode
p1
=
self
.
_start_pserver
(
False
,
True
)
p1
=
self
.
_start_pserver
(
False
,
True
,
run_pserver
)
self
.
_wait_ps_ready
(
p1
.
pid
)
# raise SIGINT to pserver
os
.
kill
(
p1
.
pid
,
signal
.
SIGINT
)
p1
.
join
()
# run pserver on CPU in async mode
p2
=
self
.
_start_pserver
(
False
,
False
,
run_pserver
)
self
.
_wait_ps_ready
(
p2
.
pid
)
# raise SIGTERM to pserver
os
.
kill
(
p2
.
pid
,
signal
.
SIGTERM
)
p2
.
join
()
def
test_list_and_serv_run_empty_optimize_block
(
self
):
# run pserver on CPU in sync mode
p1
=
self
.
_start_pserver
(
False
,
True
,
run_pserver_with_empty_block
)
self
.
_wait_ps_ready
(
p1
.
pid
)
# raise SIGTERM to pserver
...
...
@@ -100,7 +157,7 @@ class TestListenAndServOp(OpTest):
p1
.
join
()
# run pserver on CPU in async mode
p2
=
self
.
_start_pserver
(
False
,
False
)
p2
=
self
.
_start_pserver
(
False
,
False
,
run_pserver_with_empty_block
)
self
.
_wait_ps_ready
(
p2
.
pid
)
# raise SIGTERM to pserver
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
浏览文件 @
a412826c
...
...
@@ -232,6 +232,46 @@ class TestResnet(TestParallelExecutorBase):
for
loss
in
zip
(
all_reduce_last_loss
,
reduce_last_loss
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
delta2
)
if
not
use_cuda
:
return
all_reduce_first_loss_seq
,
all_reduce_last_loss_seq
=
self
.
check_network_convergence
(
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
iter
=
iter
,
batch_size
=
batch_size
,
use_cuda
=
use_cuda
,
use_reduce
=
False
,
optimizer
=
optimizer
,
enable_sequential_execution
=
True
)
reduce_first_loss_seq
,
reduce_last_loss_seq
=
self
.
check_network_convergence
(
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
iter
=
iter
,
batch_size
=
batch_size
,
use_cuda
=
use_cuda
,
use_reduce
=
True
,
optimizer
=
optimizer
,
enable_sequential_execution
=
True
)
for
loss
in
zip
(
all_reduce_first_loss
,
all_reduce_first_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
all_reduce_last_loss
,
all_reduce_last_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
delta2
)
for
loss
in
zip
(
reduce_first_loss
,
reduce_first_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
reduce_last_loss
,
reduce_last_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
delta2
)
for
loss
in
zip
(
all_reduce_first_loss_seq
,
reduce_first_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
for
loss
in
zip
(
all_reduce_last_loss_seq
,
reduce_last_loss_seq
):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
delta2
)
def
_check_resnet_convergence
(
self
,
model
,
use_cuda
=
True
,
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
浏览文件 @
a412826c
...
...
@@ -173,6 +173,8 @@ class TestTransformer(TestParallelExecutorBase):
def
test_main
(
self
):
if
core
.
is_compiled_with_cuda
():
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
True
,
enable_sequential_execution
=
True
)
self
.
check_network_convergence
(
transformer
,
use_cuda
=
False
,
iter
=
5
)
...
...
python/paddle/fluid/tests/unittests/test_pool2d_op.py
浏览文件 @
a412826c
...
...
@@ -26,7 +26,8 @@ def max_pool2D_forward_naive(x,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
):
ceil_mode
=
False
,
exclusive
=
True
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
...
...
@@ -54,7 +55,8 @@ def avg_pool2D_forward_naive(x,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
):
ceil_mode
=
False
,
exclusive
=
True
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
...
...
@@ -73,8 +75,9 @@ def avg_pool2D_forward_naive(x,
c_end
=
np
.
min
((
j
*
strides
[
1
]
+
ksize
[
1
]
-
paddings
[
1
],
W
))
x_masked
=
x
[:,
:,
r_start
:
r_end
,
c_start
:
c_end
]
out
[:,
:,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
))
/
(
(
r_end
-
r_start
)
*
(
c_end
-
c_start
))
field_size
=
((
r_end
-
r_start
)
*
(
c_end
-
c_start
))
if
exclusive
\
else
(
ksize
[
0
]
*
ksize
[
1
])
out
[:,
:,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
))
/
field_size
return
out
...
...
@@ -89,12 +92,13 @@ class TestPool2d_Op(OpTest):
self
.
init_kernel_type
()
self
.
init_pool_type
()
self
.
init_ceil_mode
()
self
.
init_exclusive
()
if
self
.
global_pool
:
self
.
paddings
=
[
0
for
_
in
range
(
len
(
self
.
paddings
))]
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
output
=
self
.
pool2D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mod
e
).
astype
(
self
.
dtype
)
output
=
self
.
pool2D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mode
,
self
.
exclusiv
e
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
)}
self
.
attrs
=
{
...
...
@@ -106,7 +110,9 @@ class TestPool2d_Op(OpTest):
'use_cudnn'
:
self
.
use_cudnn
,
'use_mkldnn'
:
self
.
use_mkldnn
,
'ceil_mode'
:
self
.
ceil_mode
,
'data_format'
:
'AnyLayout'
# TODO(dzhwinter) : should be fix latter
'data_format'
:
'AnyLayout'
,
# TODO(dzhwinter) : should be fix latter
'exclusive'
:
self
.
exclusive
}
self
.
outputs
=
{
'Out'
:
output
}
...
...
@@ -150,6 +156,9 @@ class TestPool2d_Op(OpTest):
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
False
def
init_exclusive
(
self
):
self
.
exclusive
=
True
class
TestCase1
(
TestPool2d_Op
):
def
init_test_case
(
self
):
...
...
@@ -322,5 +331,15 @@ class TestCeilModeCase4(TestCase2):
self
.
ceil_mode
=
True
class TestAvgInclude(TestCase2):
    """Variant of TestCase2 that runs with ``exclusive`` set to False."""

    def init_exclusive(self):
        """Turn the ``exclusive`` flag off for this case."""
        self.exclusive = False
class TestCUDNNAvgInclude(TestCUDNNCase3):
    """Variant of TestCUDNNCase3 that runs with ``exclusive`` set to False."""

    def init_exclusive(self):
        """Turn the ``exclusive`` flag off for this case."""
        self.exclusive = False
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_pool3d_op.py
浏览文件 @
a412826c
...
...
@@ -26,7 +26,8 @@ def max_pool3D_forward_naive(x,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
):
ceil_mode
=
False
,
exclusive
=
True
):
N
,
C
,
D
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
D
,
H
,
W
]
...
...
@@ -60,7 +61,8 @@ def avg_pool3D_forward_naive(x,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
):
ceil_mode
=
False
,
exclusive
=
True
):
N
,
C
,
D
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
D
,
H
,
W
]
...
...
@@ -85,8 +87,10 @@ def avg_pool3D_forward_naive(x,
w_end
=
np
.
min
((
j
*
strides
[
1
]
+
ksize
[
1
]
-
paddings
[
1
],
W
))
x_masked
=
x
[:,
:,
d_start
:
d_end
,
h_start
:
h_end
,
w_start
:
w_end
]
out
[:,
:,
k
,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
,
4
))
/
(
(
d_end
-
d_start
)
*
(
h_end
-
h_start
)
*
(
w_end
-
w_start
))
field_size
=
(
d_end
-
d_start
)
*
(
h_end
-
h_start
)
*
(
w_end
-
w_start
)
\
if
exclusive
else
ksize
[
0
]
*
ksize
[
1
]
*
ksize
[
2
]
out
[:,
:,
k
,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
,
4
))
/
field_size
return
out
...
...
@@ -100,13 +104,14 @@ class TestPool3d_Op(OpTest):
self
.
init_kernel_type
()
self
.
init_pool_type
()
self
.
init_ceil_mode
()
self
.
init_exclusive
()
if
self
.
global_pool
:
self
.
paddings
=
[
0
for
_
in
range
(
len
(
self
.
paddings
))]
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
output
=
self
.
pool3D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mod
e
).
astype
(
self
.
dtype
)
output
=
self
.
pool3D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mode
,
self
.
exclusiv
e
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
)}
self
.
attrs
=
{
...
...
@@ -117,7 +122,9 @@ class TestPool3d_Op(OpTest):
'global_pooling'
:
self
.
global_pool
,
'use_cudnn'
:
self
.
use_cudnn
,
'ceil_mode'
:
self
.
ceil_mode
,
'data_format'
:
'AnyLayout'
# TODO(dzhwinter) : should be fix latter
'data_format'
:
'AnyLayout'
,
# TODO(dzhwinter) : should be fix latter
'exclusive'
:
self
.
exclusive
}
self
.
outputs
=
{
'Out'
:
output
}
...
...
@@ -161,6 +168,9 @@ class TestPool3d_Op(OpTest):
def
init_ceil_mode
(
self
):
self
.
ceil_mode
=
False
def
init_exclusive
(
self
):
self
.
exclusive
=
True
class
TestCase1
(
TestPool3d_Op
):
def
init_test_case
(
self
):
...
...
@@ -333,5 +343,15 @@ class TestCeilModeCase4(TestCase2):
self
.
ceil_mode
=
True
class TestAvgInclude(TestCase2):
    """Variant of TestCase2 that runs with ``exclusive`` set to False."""

    def init_exclusive(self):
        """Turn the ``exclusive`` flag off for this case."""
        self.exclusive = False
class TestCUDNNAvgInclude(TestCUDNNCase3):
    """Variant of TestCUDNNCase3 that runs with ``exclusive`` set to False."""

    def init_exclusive(self):
        """Turn the ``exclusive`` flag off for this case."""
        self.exclusive = False
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py
0 → 100644
浏览文件 @
a412826c
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
print_function
import
unittest
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.core
as
core
import
numpy
as
np
from
threading
import
Thread
def user_reader(inputs):
    """Wrap an iterable as a reader creator.

    Returns a zero-argument callable; each call gives a fresh generator
    that yields the items of ``inputs`` in order.
    """

    def _reader():
        for item in inputs:
            yield item

    return _reader
def batch_feeder(batch_reader, pin_memory=False, img_dtype="float32"):
    """Turn a batch reader into a provider of ``[image, label]`` tensors.

    Args:
        batch_reader: callable returning an iterable of batches, where each
            batch is an iterable of ``(sample, label)`` pairs.
        pin_memory: when true the tensors are set on ``CUDAPinnedPlace``,
            otherwise on ``CPUPlace``.
        img_dtype: NumPy dtype used for the stacked samples.

    Returns:
        A zero-argument generator function yielding a two-element list of
        ``core.LoDTensor`` per batch: samples first, then int64 labels.
    """

    def _feeder():
        for batch_data in batch_reader():
            samples = []
            labels = []
            for sample, label in batch_data:
                samples.append(sample)
                # Each label is wrapped in a list to give it its own row.
                labels.append([label])

            sample_tensor = core.LoDTensor()
            label_tensor = core.LoDTensor()
            if pin_memory:
                place = core.CUDAPinnedPlace()
            else:
                place = core.CPUPlace()

            sample_tensor.set(np.array(samples, dtype=img_dtype), place)
            label_tensor.set(np.array(labels, dtype="int64"), place)
            yield [sample_tensor, label_tensor]

    return _feeder
class TestPyReader(unittest.TestCase):
    """Feed a py_reader via ``batch_feeder`` and check what is read back."""

    def setUp(self):
        """Describe the two reader slots: image and int64 label."""
        self.capacity = 10
        self.shapes = [(-1, 3, 2, 1), (-1, 1)]
        self.lod_levels = [0, 0]
        self.dtypes = ['float32', 'int64']

    def test_pin_memory_pyreader(self):
        """Run the reader end to end and validate the fetched batches."""
        with fluid.program_guard(fluid.Program(), fluid.Program()):
            if fluid.core.is_compiled_with_cuda():
                place = fluid.CUDAPlace(0)
            else:
                place = fluid.CPUPlace()
            executor = fluid.Executor(place)

            data_file = fluid.layers.py_reader(
                capacity=self.capacity,
                dtypes=self.dtypes,
                lod_levels=self.lod_levels,
                shapes=self.shapes)
            read_out_data = fluid.layers.read_file(data_file)

            # Ten random (sample, label) pairs, batched in twos below.
            self.inputs = []
            for _ in range(10):
                sample = np.random.uniform(
                    low=0, high=1, size=[3, 2, 1]).astype("float32")
                label = np.random.uniform(
                    low=0, high=10, size=[1]).astype("int64")
                self.inputs.append((sample, label))

            # One LoDTensorArray per batch; only the number of entries is
            # used later, to decide how many times the executor runs.
            self.input_tensors = []
            tensor_provider = batch_feeder(
                paddle.batch(
                    user_reader(self.inputs), batch_size=2),
                pin_memory=bool(fluid.core.is_compiled_with_cuda()))
            for image_tensor, label_tensor in tensor_provider():
                tensor_array = fluid.LoDTensorArray()
                tensor_array.append(image_tensor)
                tensor_array.append(label_tensor)
                self.input_tensors.append(tensor_array)

            # The same batches as plain Python lists, used as ground truth.
            self.batched_inputs = []
            batched_reader = paddle.batch(
                user_reader(self.inputs), batch_size=2)
            for batch in batched_reader():
                images = []
                labels = []
                for image, label in batch:
                    images.append(image)
                    labels.append([label])
                self.batched_inputs.append([images, labels])

            data_file.decorate_tensor_provider(
                batch_feeder(
                    paddle.batch(
                        user_reader(self.inputs), batch_size=2),
                    pin_memory=bool(fluid.core.is_compiled_with_cuda())))

            executor.run(fluid.default_startup_program())
            self.outputs = []

            data_file.start()
            for _ in self.input_tensors:
                fetched = executor.run(fetch_list=list(read_out_data))
                self.outputs.append(fetched)
            data_file.reset()
            self.validate()

    def validate(self):
        """Assert every fetched batch equals the corresponding fed batch."""
        self.assertEqual(len(self.batched_inputs), len(self.outputs))
        for expected_batch, fetched_batch in zip(self.batched_inputs,
                                                 self.outputs):
            self.assertEqual(len(expected_batch), len(fetched_batch))
            expected_arrays = [np.array(slot) for slot in expected_batch]
            for expected, fetched in zip(expected_arrays, fetched_batch):
                self.assertTrue((expected == fetched).all())
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_seq_pool.py
浏览文件 @
a412826c
...
...
@@ -184,6 +184,20 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D):
out
[
i
]
=
np
.
reshape
(
np
.
amax
(
sub_x
,
axis
=
0
),
(
3
,
11
))
class TestSeqMaxPool2DInference(TestSeqMaxPool2D):
    """Sequence MAX pooling reference with the ``is_test`` attribute set."""

    def compute(self, x, offset, out):
        """Write the element-wise maximum of each sequence into ``out``.

        ``offset[0]`` holds the sequence boundaries along the first axis
        of ``x``; each sequence is flattened to rows of 3 * 11 values.
        """
        self.attrs = {'pooltype': "MAX", 'is_test': True}
        bounds = offset[0]
        for i in range(len(bounds) - 1):
            begin = bounds[i]
            end = bounds[i + 1]
            flat_seq = np.reshape(x[begin:end, :], (-1, 3 * 11))
            seq_max = np.amax(flat_seq, axis=0)
            out[i] = np.reshape(seq_max, (3, 11))

    def test_check_grad(self):
        """Skip the gradient check.

        Grad computation does not apply to Sequence MAX Pool executed
        when is_test is true.
        """
        return
class
TestSeqLastPool2D
(
TestSeqAvgPool2D
):
def
compute
(
self
,
x
,
offset
,
out
):
self
.
attrs
=
{
'pooltype'
:
"LAST"
}
...
...
python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
浏览文件 @
a412826c
...
...
@@ -26,7 +26,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
Test softmax with cross entropy operator with discreate one-hot labels.
"""
def
initParams
(
self
):
self
.
numeric_stable_mode
=
False
def
setUp
(
self
):
self
.
initParams
()
self
.
op_type
=
"softmax_with_cross_entropy"
batch_size
=
41
class_num
=
37
...
...
@@ -46,6 +50,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
"Softmax"
:
softmax
.
astype
(
"float64"
),
"Loss"
:
cross_entropy
.
astype
(
"float64"
)
}
self
.
attrs
=
{
"numeric_stable_mode"
:
self
.
numeric_stable_mode
}
def
test_check_output
(
self
):
self
.
check_output
()
...
...
@@ -54,6 +59,11 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self
.
check_grad
([
"Logits"
],
"Loss"
)
class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
    """Re-run the hard-label test with ``numeric_stable_mode`` enabled.

    NOTE(review): the class name mentions cuDNN, but the only thing this
    class changes is ``numeric_stable_mode`` -- confirm the intended link.
    """

    def initParams(self):
        """Switch the op to its numerically stable mode."""
        self.numeric_stable_mode = True
class
TestSoftmaxWithCrossEntropyOp2
(
OpTest
):
"""
Test softmax with cross entropy operator with soft labels.
...
...
@@ -93,7 +103,11 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
Test softmax with cross entropy operator with ignore_index.
"""
def
initParams
(
self
):
self
.
numeric_stable_mode
=
False
def
setUp
(
self
):
self
.
initParams
()
self
.
op_type
=
"softmax_with_cross_entropy"
batch_size
=
41
class_num
=
37
...
...
@@ -114,7 +128,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
"Softmax"
:
softmax
.
astype
(
"float64"
),
"Loss"
:
cross_entropy
.
astype
(
"float64"
)
}
self
.
attrs
=
{
"ignore_index"
:
ignore_index
}
self
.
attrs
=
{
"ignore_index"
:
ignore_index
,
"numeric_stable_mode"
:
self
.
numeric_stable_mode
}
def
test_check_output
(
self
):
self
.
check_output
()
...
...
@@ -123,5 +140,10 @@ class TestSoftmaxWithCrossEntropyOp3(OpTest):
self
.
check_grad
([
"Logits"
],
"Loss"
)
class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
    """Re-run the ignore_index test with ``numeric_stable_mode`` enabled.

    NOTE(review): the class name mentions cuDNN, but the only thing this
    class changes is ``numeric_stable_mode`` -- confirm the intended link.
    """

    def initParams(self):
        """Switch the op to its numerically stable mode."""
        self.numeric_stable_mode = True
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
a412826c
...
...
@@ -35,6 +35,7 @@ import sys
import
numpy
as
np
import
collections
import
six
import
logging
from
.ps_dispatcher
import
RoundRobin
,
HashName
,
PSDispatcher
from
..
import
core
,
framework
...
...
@@ -767,6 +768,15 @@ in a single call.")
prefetch_var_name_to_block_id
.
extend
(
lookup_table_var_name_to_block_id
)
if
len
(
optimize_blocks
)
==
0
:
logging
.
warn
(
"pserver ["
+
str
(
endpoint
)
+
"] has no optimize block!!"
)
pre_block_idx
=
pserver_program
.
num_blocks
-
1
empty_block
=
pserver_program
.
_create_block
(
pre_block_idx
)
optimize_blocks
.
append
(
empty_block
)
# In some case, some parameter server will have no parameter to optimize
# So we give an empty optimize block to parameter server.
attrs
=
{
"optimize_blocks"
:
optimize_blocks
,
"endpoint"
:
endpoint
,
...
...
@@ -910,11 +920,11 @@ to transpile() call.")
block_idx
=
int
(
block_name
.
split
(
block_suffix
)[
1
])
orig_var
=
self
.
origin_program
.
global_block
().
vars
[
orig_var_name
]
skip_
numel
=
0
skip_
dim0
=
0
slice_vars
=
self
.
param_var_mapping
[
orig_var_name
]
for
slice_var
in
slice_vars
[:
block_idx
]:
skip_
numel
+=
reduce
(
lambda
x
,
y
:
x
*
y
,
slice_var
.
shape
)
slice_vars_and_attrs
.
append
([
orig_var
,
skip_
numel
,
param
])
skip_
dim0
+=
slice_var
.
shape
[
0
]
slice_vars_and_attrs
.
append
([
orig_var
,
skip_
dim0
,
param
])
return
slice_vars_and_attrs
...
...
@@ -1065,7 +1075,12 @@ to transpile() call.")
continue_search_lookup_table_op
=
False
all_ops
=
program
.
global_block
().
ops
for
op
in
all_ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
and
self
.
table_name
==
op
.
input
(
"W"
)[
0
]:
if
not
op
.
attr
(
'is_distributed'
):
raise
RuntimeError
(
"lookup_table_op that lookup an distributed embedding table"
"should set is_distributed to true"
)
continue_search_lookup_table_op
=
True
lookup_table_op_index
=
lookup_table_op_index
if
lookup_table_op_index
!=
-
1
else
list
(
...
...
@@ -1275,7 +1290,6 @@ to transpile() call.")
}
outputs
=
{
"ParamOut"
:
[
param_var
]}
# only support sgd now
import
logging
logging
.
warn
(
"distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of "
+
table_opt_op
.
type
)
...
...
@@ -1442,6 +1456,9 @@ to transpile() call.")
elif
op_type
==
"decayed_adagrad"
:
if
varkey
==
"Moment"
:
return
param_shape
elif
op_type
==
"ftrl"
:
if
varkey
in
[
"SquaredAccumulator"
,
"LinearAccumulator"
]:
return
param_shape
elif
op_type
==
"sgd"
:
pass
else
:
...
...
python/paddle/fluid/transpiler/inference_transpiler.py
浏览文件 @
a412826c
...
...
@@ -61,6 +61,9 @@ class InferenceTranspiler(object):
raise
TypeError
(
"scope should be as Scope type or None"
)
use_mkldnn
=
bool
(
os
.
getenv
(
"FLAGS_use_mkldnn"
,
False
))
if
use_mkldnn
:
self
.
_depthwise_conv_mkldnn
(
program
)
self
.
_fuse_batch_norm
(
program
,
place
,
scope
)
if
use_mkldnn
:
self
.
_fuse_conv_bias_mkldnn
(
program
)
...
...
@@ -70,6 +73,31 @@ class InferenceTranspiler(object):
program
)
# ResNet residual block merging
self
.
_fuse_bn_relu_mkldnn
(
program
)
def _depthwise_conv_mkldnn(self, program):
    '''
    Rewrite every depthwise_conv2d op in block 0 as a conv2d op, for
    MKLDNN programs.

    The result is:
        - before:
            - any_other_op->depthwise_conv->any_other_op
        - after:
            - any_other_op->conv->any_other_op

    :param program: program to transpile
    :type program: Program
    '''
    self.block = program.block(0)

    for current_op in self.block.ops:
        if current_op.type != 'depthwise_conv2d':
            continue
        current_op.desc.set_type("conv2d")

    # TODO(luotao): use clone() method to flush the program.desc in force,
    # since some large program.desc will not be flushed immediately.
    # And a better solution will be considered later.
    # The clone is discarded; the call is made only for that flush.
    program.clone()
def
_fuse_conv_eltwise_mkldnn
(
self
,
program
):
'''
Transpile the program fusing elementwise_add into conv for MKLDNN
...
...
python/setup.py.in
浏览文件 @
a412826c
...
...
@@ -27,7 +27,7 @@ def _get_version_detail(idx):
if re.match('@TAG_VERSION_REGEX@', '@PADDLE_VERSION@'):
version_details = '@PADDLE_VERSION@'.split('.')
if len(version_details)
=
= 3:
if len(version_details)
>
= 3:
return version_details[idx]
return 0
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录